php - Fastest way to check presence of text in many domains (above 1000)

后端 未结 1 1192
一生所求
一生所求 2020-12-04 01:42

I have a php script running and using cURL to retrieve the content of webpages on which I would like to check for the presence of some text.

Right now it looks like

相关标签:
1条回答
  • 2020-12-04 01:53

    You can use curl_multi_init, which allows multiple cURL handles to be processed in parallel.

    Example

    // Candidate pages that will be scanned for the search text.
    $url = array(
        'http://www.huffingtonpost.com',
        'http://www.yahoo.com',
        'http://www.google.com',
        'http://technet.microsoft.com/en-us/',
    );

    // Time the whole parallel check and print the matching URLs,
    // followed by the elapsed time in seconds.
    $start = microtime(true);
    echo "<pre>";
    $matches = checkLinks($url, "Azure");
    print_r($matches);
    $elapsed = microtime(true) - $start;
    echo "<h1>", $elapsed, "</h1>";
    

    Output

    Array
    (
        [0] => http://technet.microsoft.com/en-us/
    )
    
    1.2735739707947 <-- Faster
    

    Function Used

    /**
     * Fetch a set of URLs in parallel and report which pages contain a text.
     *
     * Each URL is requested through one shared cURL multi handle. A page counts
     * as a match when its transfer finished without a cURL error, the HTTP
     * status is 200, and the body contains $text (case-insensitive search).
     * Redirects are not followed, so a 3xx response is never a match.
     *
     * @param array|Traversable $nodes URLs to check; keys are only used internally.
     * @param string            $text  Text to look for in each response body.
     *
     * @return array List of effective URLs (as reported by curl_getinfo) of the
     *               matching pages, in the same order as $nodes.
     */
    function checkLinks($nodes, $text) {
        $mh = curl_multi_init();
        $curl_array = array();   // input key => easy handle still in flight
        $order = array();        // input keys in their original order
        foreach ( $nodes as $i => $url ) {
            $handle = curl_init($url);
            if ($handle === false) {
                // Handle could not be created; treat the URL as a non-match.
                continue;
            }
            curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($handle, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)');
            curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 5);
            curl_setopt($handle, CURLOPT_TIMEOUT, 15);
            curl_multi_add_handle($mh, $handle);
            $curl_array[$i] = $handle;
            $order[] = $i;
        }

        $matched = array();      // input key => effective URL of a matching page
        $running = 0;
        do {
            // Older libcurl versions may ask to be called again immediately.
            do {
                $status = curl_multi_exec($mh, $running);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            // Collect finished transfers. The per-transfer result code comes
            // from curl_multi_info_read(); without reading it, a failed
            // transfer (e.g. a timeout after the headers arrived) would not
            // be distinguishable from a successful one.
            while (($done = curl_multi_info_read($mh)) !== false) {
                $handle = $done['handle'];
                $i = array_search($handle, $curl_array, true);
                if ($i !== false && $done['result'] === CURLE_OK) {
                    $info = curl_getinfo($handle);
                    if ((int) $info['http_code'] === 200) {
                        // Cast guards against a null body for empty responses.
                        $body = (string) curl_multi_getcontent($handle);
                        if (stripos($body, $text) !== false) {
                            $matched[$i] = $info['url'];
                        }
                    }
                }
                // Release the handle (and its buffered body) right away so
                // memory does not grow with the number of URLs.
                curl_multi_remove_handle($mh, $handle);
                curl_close($handle);
                if ($i !== false) {
                    unset($curl_array[$i]);
                }
            }

            if ($status !== CURLM_OK) {
                break;
            }

            // Block until there is network activity instead of polling on a
            // fixed interval; fall back to a short sleep when select cannot
            // wait, to avoid spinning.
            if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
                usleep(10000);
            }
        } while ($running > 0);

        // Clean up anything that never completed (only possible if the multi
        // loop was aborted by an error status).
        foreach ($curl_array as $handle) {
            curl_multi_remove_handle($mh, $handle);
            curl_close($handle);
        }
        curl_multi_close($mh);

        // Report matches in the caller's original order.
        $res = array();
        foreach ($order as $i) {
            if (isset($matched[$i])) {
                $res[] = $matched[$i];
            }
        }
        return $res;
    }
    
    0 讨论(0)
提交回复
热议问题