Looping over urls to do the same thing

前端 未结 1 812
遇见更好的自我
遇见更好的自我 2020-11-28 16:48

I am trying to scrape a few sites. Here is my code:

for (var i = 0; i < urls.length; i++) {
    url = urls[i];
    console.log(\"Start scraping: \" + url);         


        
相关标签:
1条回答
  • 2020-11-28 17:36

    PhantomJS is asynchronous. By calling page.open() multiple times using a loop, you essentially rush the execution of the callback. You're overwriting the current request before it is finished with a new request which is then again overwritten. You need to execute them one after the other, for example like this:

    // Sequential loading written out by hand: the second page.open() is only
    // issued from inside the completion callback of the first, so a new request
    // never overwrites one that is still in flight.
    // NOTE(review): waitFor is not defined in this snippet - presumably the
    // polling helper from the PhantomJS examples (condition function first,
    // ready-callback second); confirm against the full script.
    // NOTE(review): both calls use the same `url` variable here; in real use each
    // level would presumably open a different URL.
    page.open(url, function () {
        waitFor(function() {
           // something
        }, function() {
            // First page finished: only now start the next request.
            page.open(url, function () {
                waitFor(function() {
                   // something
                }, function() {
                    // and so on
                });
            });
        });
    });
    

    But this is tedious. Utilities such as async.js can help you write nicer code. You can install it through npm in the directory of the PhantomJS script.

    var async = require("async"); // install async through npm

    // Turn every URL into a task function. A task opens its page and reports
    // completion through the callback it receives, which lets async.series()
    // start the following task only after the current one has finished.
    var tests = urls.map(function(target){
        return function runTest(done){
            var signalDone = function() {
                done();
            };
            page.open(target, function () {
                waitFor(function() {
                   // something
                }, signalDone);
            });
        };
    });

    // Run the tasks one after another; when the last one has completed,
    // write the collected output to disk and quit PhantomJS.
    async.series(tests, function finish(){
        fs.write('test.txt', output);
        phantom.exit();
    });
    

    If you don't want any dependencies, then it is also easy to define your own recursive function (from here):

    // Queue of URLs still to be visited; next_page() removes entries from the front.
    var urls = [/*....*/];
    
    /**
     * Open a single URL and, after waitFor() signals completion, move on to
     * the next queued URL via next_page().
     *
     * @param {string} url - address handed to page.open()
     */
    function handle_page(url){
        // Condition polled by waitFor(); placeholder for the real readiness check.
        var isReady = function() {
           // something
        };
        // Invoked by waitFor() once the condition holds: continue with the queue.
        var proceed = function() {
            next_page();
        };
        page.open(url, function(){
            waitFor(isReady, proceed);
        });
    }
    
    /**
     * Process the next queued URL, or quit PhantomJS when the queue is empty.
     *
     * Removes the first entry from the shared `urls` array and passes it to
     * handle_page(), which calls next_page() again once that page is done.
     */
    function next_page(){
        // Check the length: an array object is always truthy, so `!urls`
        // can never detect that the queue has run out.
        if(urls.length === 0){
            phantom.exit(0);
            // Return explicitly: statements after phantom.exit() can still run,
            // and handle_page() must not be called without a URL.
            return;
        }
        handle_page(urls.shift());
    }
    
    // Start the chain with the first queued URL.
    next_page();
    
    0 讨论(0)
提交回复
热议问题