PHP Google scraper problem — need help
<?php
// Google SERP scraper: reads q/pages/start from the query string, fetches
// each result page via cURL (optionally through a proxy), and echoes the
// extracted result URLs as a comma-separated list.
ini_set("max_execution_time", 0);
set_time_limit(0); // no time-outs

// --- input ---------------------------------------------------------------
// Original fell back to an undefined $key when 'q' was missing; default to
// an empty query instead of emitting an undefined-variable notice.
$query  = isset($_GET['q'])     ? (string) $_GET['q']     : '';
$npages = isset($_GET['pages']) ? (int) $_GET['pages']    : 10; // pages to fetch
$start  = isset($_GET['start']) ? (int) $_GET['start']    : 0;  // first page index
if ($npages >= 100) {
    $npages = 100; // hard cap, as before
}

$gg_url = 'http://www.google.com/search?hl=en&q=' . urlencode($query) . '&start=';
$i    = 1; // running count of scraped URLs
$size = 0; // total bytes of HTML fetched

$options = array(
    CURLOPT_PROXY          => "193.138.185.51", // proxy host if needed
    CURLOPT_PROXYPORT      => "3128",           // proxy port if needed
    CURLOPT_RETURNTRANSFER => true,  // return web page as a string
    CURLOPT_HEADER         => false, // don't return headers
    CURLOPT_FOLLOWLOCATION => true,  // follow redirects
    CURLOPT_ENCODING       => "",    // handle all encodings
    CURLOPT_AUTOREFERER    => true,  // set referer on redirect
    CURLOPT_CONNECTTIMEOUT => 120,   // timeout on connect
    CURLOPT_TIMEOUT        => 120,   // timeout on response
    CURLOPT_MAXREDIRS      => 10,    // stop after 10 redirects
    CURLOPT_COOKIEFILE     => "cookie.txt",
    CURLOPT_COOKIEJAR      => "cookie.txt",
    CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3",
    CURLOPT_REFERER        => "http://www.google.com/",
);

for ($page = $start; $page < $npages; $page++) {
    // Google paginates in steps of 10; use arithmetic instead of the
    // original $page.'0' string-concatenation hack.
    $ch = curl_init($gg_url . ($page * 10));
    curl_setopt_array($ch, $options);
    $scraped = curl_exec($ch);
    curl_close($ch);

    if ($scraped === false) {
        // Network/proxy failure: skip this page rather than feeding `false`
        // into preg_match_all() and strlen() as the original did.
        continue;
    }

    $results = array();
    // NOTE(review): this matches Google's legacy `class=l` result markup —
    // verify against the current SERP HTML before relying on it.
    preg_match_all('/a href="([^"]+)" class=l.+?>.+?<\/a>/', $scraped, $results);
    foreach ($results[1] as $url) {
        // Escape before echoing: the URLs come from a remote page and are
        // untrusted (reflected-XSS risk in the original).
        echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . ',';
        $i++;
    }
    $size += strlen($scraped);
}
-
Anthoni -
Thanks
{{ DiscussionBoard.errors[5441885].message }} -