file_get_contents script works with some websites but not others

前端 未结 3 1420
轻奢々
轻奢々 2021-01-01 04:24

I'm looking to build a PHP script that parses HTML for particular tags. I've been using this code block, adapted from this tutorial:



        
相关标签:
3条回答
  • 2021-01-01 05:04
    // find() returns an array of matches unless an index is passed, so ask for
    // element 0 to get the first <title> node before reading ->innertext.
    // The load result and the lookup are both checked because either can come
    // back empty (unreachable page, document without a <title>).
    $html = file_get_html('http://google.com/');
    $title_node = $html ? $html->find('title', 0) : null;
    $title = $title_node ? $title_node->innertext : null;
    

    Or, if you prefer, use preg_match — though you should really be using cURL instead of file_get_contents (fgc):

    /**
     * Download a URL with cURL while sending a fixed set of browser-like
     * request headers.
     *
     * The transfer is returned as a string rather than printed
     * (CURLOPT_RETURNTRANSFER) and redirects are followed
     * (CURLOPT_FOLLOWLOCATION).
     *
     * @param string $url Address to request.
     * @return string|false The value produced by curl_exec().
     */
    function curl($url){
        // Request headers, sent verbatim with every call.
        $request_headers = array(
            "User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13",
            "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language:en-us,en;q=0.5",
            "Accept-Encoding:gzip,deflate",
            "Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Keep-Alive:115",
            "Connection:keep-alive",
            "Cache-Control:max-age=0",
        );

        // Options are applied in the listed order.
        $options = array(
            CURLOPT_URL            => $url,
            CURLOPT_HTTPHEADER     => $request_headers,
            CURLOPT_ENCODING       => "gzip",
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_FOLLOWLOCATION => 1,
        );

        $handle = curl_init();
        curl_setopt_array($handle, $options);
        $response = curl_exec($handle);
        curl_close($handle);

        return $response;
    }
    
    
    $data = curl('http://www.google.com');
    $regex = '#<title>(.*?)</title>#mis';

    // curl() hands back curl_exec()'s result, which is false on a failed
    // transfer, and a page may contain no <title> at all. Only read the
    // capture group when the pattern really matched.
    $match = array();
    $found = is_string($data) && preg_match($regex, $data, $match) === 1;

    var_dump($match);
    if ($found) {
        echo $match[1];
    }
    
    0 讨论(0)
  • 2021-01-01 05:09

    It just requires a user-agent ("any" really, any string suffices):

    // Build a stream context whose only setting is the HTTP user agent string,
    // then request the page through that context.
    $context = stream_context_create(array(
        "http" => array("user_agent" => "any"),
    ));
    file_get_contents("http://www.freshdirect.com", false, $context);
    

    See more options.

    Of course, you can set user_agent in your ini:

     // Set the user agent through the ini setting, then fetch and print the page.
     ini_set('user_agent', 'any');
     $page = file_get_contents('http://www.freshdirect.com');
     echo $page;
    

    ... but I prefer to be explicit for the next programmer working on it.

    0 讨论(0)
  • 2021-01-01 05:16

    Another option: some hosts disable CURLOPT_FOLLOWLOCATION, so following redirects recursively yourself is what you want; this version also logs any errors to a text file. It also includes a simple example of how to use DOMDocument() to extract the content — obviously it's not extensive, but it's something you could build upon.

    <?php 
    /**
     * Fetch a URL with cURL, following HTTP redirects manually instead of
     * relying on CURLOPT_FOLLOWLOCATION.
     *
     * CURLOPT_HEADER is enabled, so the string returned on success starts with
     * the raw response headers, followed by a blank line and then the body.
     *
     * On any non-200 response that cannot be followed as a redirect, every
     * curl_getinfo() field plus the requested URL is appended to
     * ./curl.error.log (best effort) and FALSE is returned.
     *
     * Terminates the script with die() when the cURL extension is unavailable.
     *
     * @param string $url           Address to request.
     * @param int    $max_redirects How many further redirects may be followed
     *                              before giving up; guards against redirect loops.
     * @return string|false Response (headers + body) on HTTP 200, FALSE otherwise.
     */
    function file_get_site($url, $max_redirects = 10){
        if (!function_exists('curl_init')) {
            die('cURL Must be installed. Ask your host to enable it or uncomment extension=php_curl.dll in php.ini');
        }

        $request_headers = array(
            "Accept: text/xml,application/xml,application/xhtml+xml,"
                . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
            "Cache-Control: max-age=0",
            "Connection: keep-alive",
            "Keep-Alive: 300",
            "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Accept-Language: en-us,en;q=0.5",
            "Pragma: ",
        );

        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0 Firefox/5.0');
        curl_setopt($curl, CURLOPT_HTTPHEADER, $request_headers);
        curl_setopt($curl, CURLOPT_HEADER, true);
        curl_setopt($curl, CURLOPT_REFERER, $url);
        curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curl, CURLOPT_TIMEOUT, 60);

        $html = curl_exec($curl);

        $status = curl_getinfo($curl);
        curl_close($curl);

        $code = (int) $status['http_code'];
        if ($code === 200) {
            return $html;
        }

        if ($max_redirects > 0 && in_array($code, array(301, 302, 303, 307, 308), true)) {
            $target = '';
            if (!empty($status['redirect_url'])) {
                // cURL reports the redirect target it would have followed,
                // already resolved against the request URL.
                $target = $status['redirect_url'];
            } elseif (is_string($html)) {
                // Fallback: read the target from the raw header block. Header
                // names are case-insensitive, and the value runs to end of line.
                list($raw_headers) = explode("\r\n\r\n", $html, 2);
                $matches = array();
                if (preg_match('/^(?:Location|URI):(.*)$/mi', $raw_headers, $matches) === 1) {
                    $target = trim($matches[1]);
                }
            }

            if ($target !== '' && parse_url($target) !== false) {
                return file_get_site($target, $max_redirects - 1);
            }
            // No usable target: fall through and log it like any other failure.
        }

        $oline = '';
        foreach ($status as $key => $eline) {
            // Some curl_getinfo() entries are arrays; encode them so they can
            // be written on the log line.
            if (is_array($eline)) {
                $eline = json_encode($eline);
            }
            $oline .= '['.$key.']'.$eline.' ';
        }
        $line = $oline." \r\n ".$url."\r\n-----------------\r\n";

        // Logging is best effort: a log file that cannot be opened must not
        // turn a failed fetch into a fatal error.
        $handle = @fopen('./curl.error.log', 'a');
        if ($handle !== false) {
            fwrite($handle, $line);
            fclose($handle);
        }
        return FALSE;
    }
    
    
    /**
     * Pull a single value out of an HTML document.
     *
     * Without $id, returns the text content (nodeValue) of the first <$tag>
     * element. With $id, scans every <$tag> element and returns the "content"
     * attribute of the first one whose $id attribute equals $value.
     *
     * @param string      $source HTML markup to parse.
     * @param string      $tag    Element name to look for, e.g. 'title' or 'meta'.
     * @param string|null $id     Attribute name to filter on, or null for no filter.
     * @param string|null $value  Required value of the $id attribute.
     * @return string|null The extracted value, or null when $source is empty /
     *                     not a string or no element qualifies.
     */
    function get_content_tags($source,$tag,$id=null,$value=null){
        // Nothing to parse (for example a failed download that produced FALSE).
        if (!is_string($source) || $source === '') {
            return null;
        }

        $xml = new DOMDocument();
        // Real-world markup is rarely valid; collect parser complaints
        // internally instead of emitting warnings, then restore the old setting.
        $previous = libxml_use_internal_errors(true);
        $xml->loadHTML($source);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        foreach ($xml->getElementsByTagName($tag) as $node) {
            if ($id == null) {
                // No attribute filter: the first element wins.
                return $node->nodeValue;
            }
            if ($node->getAttribute($id) == $value) {
                return $node->getAttribute('content');
            }
            // Filter given but not satisfied: keep scanning the remaining
            // elements instead of returning this one's text.
        }

        return null;
    }
    
    
    $source = file_get_site('http://www.freshdirect.com/about/index.jsp');

    // file_get_site() returns FALSE when the request fails; only hand real
    // markup to the parser.
    if (is_string($source) && $source !== '') {
        echo get_content_tags($source,'title'); //FreshDirect

        echo get_content_tags($source,'meta','name','description'); //Online grocer providing high quality fresh......
    } else {
        echo 'Unable to download the page.';
    }
    
    ?>
    
    0 讨论(0)
提交回复
热议问题