robots.txt checker in PHP
This sample code helps you check whether crawling is allowed by a site's robots.txt. The function returns 1 if you are allowed to take data from the given URL and 0 otherwise. Use it whenever you need to take data from a website, so that you first confirm whether crawling is allowed on that site.
<?php
/**
 * Check whether the robots.txt of a URL's host lets a generic crawler
 * ("User-agent: *") fetch that URL.
 *
 * The robots.txt file is fetched with Read_Content() from the root of the
 * URL's own scheme/host/port. Only groups that name "User-agent: *" are
 * considered. Allow and Disallow rules are matched against the URL's path
 * (plus query string); "*" matches any run of characters and a trailing "$"
 * anchors the end of the path. When several rules match, the longest rule
 * wins and Allow wins a tie.
 *
 * @param string $url URL to test. A missing scheme is treated as "http://".
 * @return int 1 if crawling is allowed (also when there is no usable
 *             robots.txt or the URL has no host), 0 if it is disallowed.
 */
function robots_allowed($url){
    $url = trim((string) $url);
    // Scheme-less input such as "www.example.com/page" is treated as http.
    if (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url)) {
        $url = 'http://' . $url;
    }

    $parts = parse_url($url);
    if (!is_array($parts) || !isset($parts['host']) || $parts['host'] === '') {
        return 1; // No host to fetch robots.txt from, so nothing forbids the URL.
    }

    $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
    if ($scheme !== 'http' && $scheme !== 'https') {
        $scheme = 'http';
    }
    $origin = $scheme . '://' . $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }

    // Rules are compared with the path and query string, never with the host.
    $target = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
    if (isset($parts['query'])) {
        $target .= '?' . $parts['query'];
    }

    $content = Read_Content($origin . '/robots.txt');
    if (!is_string($content) || $content === '') {
        return 1; // Missing or unreadable robots.txt places no restriction.
    }
    if (substr($content, 0, 3) === "\xEF\xBB\xBF") {
        $content = substr($content, 3); // Drop a UTF-8 byte order mark.
    }

    $rules = [];           // Each entry: [path pattern, true for Allow / false for Disallow].
    $groupApplies = false; // The group being read names "User-agent: *".
    $inAgentRun = false;   // The previous rule-relevant line was a User-agent line.
    foreach (preg_split('/\r\n|\r|\n/', $content) as $line) {
        $hash = strpos($line, '#');
        if ($hash !== false) {
            $line = substr($line, 0, $hash); // Strip trailing comment.
        }
        $colon = strpos($line, ':');
        if ($colon === false) {
            continue;
        }
        $field = strtolower(trim(substr($line, 0, $colon)));
        $value = trim((string) substr($line, $colon + 1));

        if ($field === 'user-agent') {
            // Consecutive User-agent lines share one group; the first one
            // after a rule line opens a new group.
            if (!$inAgentRun) {
                $groupApplies = false;
                $inAgentRun = true;
            }
            if ($value === '*') {
                $groupApplies = true;
            }
            continue;
        }
        if ($field !== 'allow' && $field !== 'disallow') {
            continue; // Sitemap, Crawl-delay, etc. are not access rules.
        }
        $inAgentRun = false;
        if (!$groupApplies || $value === '') {
            continue; // Other agent's group, or an empty rule (no restriction).
        }
        if ($value[0] !== '/' && $value[0] !== '*') {
            $value = '/' . $value; // Tolerate rules written without the leading slash.
        }
        $rules[] = [$value, $field === 'allow'];
    }

    $allowed = 1;
    $bestLength = -1;
    foreach ($rules as $rule) {
        list($pattern, $isAllow) = $rule;
        $length = strlen($pattern);

        $anchored = substr($pattern, -1) === '$';
        if ($anchored) {
            $pattern = (string) substr($pattern, 0, -1);
        }
        // Quote the rule so it is matched literally, then re-enable "*".
        $regex = '#^' . str_replace('\*', '.*', preg_quote($pattern, '#'))
            . ($anchored ? '$' : '') . '#sD';
        if (preg_match($regex, $target) !== 1) {
            continue;
        }
        // Longest matching rule decides; Allow wins when lengths are equal.
        if ($length > $bestLength || ($length === $bestLength && $isAllow)) {
            $bestLength = $length;
            $allowed = $isAllow ? 1 : 0;
        }
    }

    return $allowed;
}
/**
 * Open a URL (or local path) and return its content.
 *
 * @param string $url       Location passed to fopen().
 * @param int    $max_bytes Upper bound on the number of bytes read.
 * @return string The content read, or an empty string when the location
 *                cannot be opened or read.
 */
function Read_Content($url, $max_bytes = 512000){
    // Failure to open is reported through the empty return value, so the
    // warning fopen() would emit is suppressed on purpose.
    $handle = @fopen($url, 'r');
    if (!$handle) {
        return '';
    }
    // A single fread() may return early on network streams;
    // stream_get_contents() keeps reading until EOF or the byte limit.
    $contents = stream_get_contents($handle, (int) $max_bytes);
    fclose($handle);
    return is_string($contents) ? $contents : '';
}
?>
//**********************Usage*************************
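// Pass the full URL including the scheme; $result is 1 when crawling is allowed and 0 when it is not.
$result=robots_allowed("http://www.samplephpcodes.com");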




