Using casperjs and phantomjs to scrape multiple pages

时光怂恿深爱的人放手 提交于 2019-12-22 01:41:01

问题


I'm trying to scrape a number of pages that share a standard format. I've been able to use PhantomJS to scrape a single page successfully, but when I try to iterate over multiple pages, the asynchronous processing makes the script hang. What's the proper way to tell CasperJS/PhantomJS to wait for one page to finish before starting the next?


// Script setup: one shared web page object and the filesystem module.
var page = require('webpage').create();
var fs = require('fs');

// Forward every console message received from the page to this script's
// console. The output encoding is (re)assigned to utf-8 on each message.
page.onConsoleMessage = function(msg) {
    phantom.outputEncoding = "utf-8";
    console.log(msg);
};


// this overwrites the previous output file
// (opened in "w" mode and seeded with the marker "--").
// NOTE: `f` is assigned without `var`, so it is an implicit global.

f = fs.open("lat_long.txt", "w");
f.write("--");
f.close();


   // this is the unique identifier for the locations. For now, I just have three datapoints
  var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"]; 

 /// This code will be used to loop through the different locations. For now, set to look at only one.
 // NOTE: the loop bound is hard-coded to 1, `q` is assigned without `var`
 // (implicit global), and the URL below always uses EPAID[0] rather than EPAID[q].
 for (q= 0;  q < 1; q++)  {
    // Flag intended for the (disabled) wait loop further down; it is set to
    // true once a record has been written.
    var processing = false;



   //we construct the target url
   var url  = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[0]  + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;


   // NOTE(review): per the question text the load is asynchronous, so the loop
   // does not wait here. The handler is also installed only after the load has
   // been started, and is replaced on every iteration of the loop.
   page.open(url);
   page.onLoadFinished = function(status) {
   // Anything other than "success" is silently ignored.
   if ( status === "success" ) {
       // Inject jQuery into the loaded page, then scrape it in the callback.
       page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
           var str = page.evaluate(function() {
               // NOTE: $value, $Object, $string and i are assigned without
               // `var`, so they are implicit globals.
               $value = [];
               $Object = $(".result tr");
               // Ten passes; each pass pushes two entries (html of the first
               // matched td, and html of the td two siblings after it), then
               // advances $Object with .next().
               for (i =0 ; i < 10; i++) {
             $value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
             $Object = $Object.next();
            }

            // Assemble the record text from fixed positions in $value.
            // Keys and values are not quoted, so this is not valid JSON.
            $string = "{ EPAID: "+  $value[0] +  ", " +
                     "Name: "+  $value[1] +  ", " +
                     "City: "+  $value[4] +  ", " +
                     "State: "+  $value[6] +  ", " +
                     "ZipCode: "+  $value[8] +  ", " +
                     "Latitude: "+  $value[14] +  ", " +
                     "Longitude: "+  $value[16] +  " }" ;
            return $string;
        });

        // Append the record to the output file (`f` is an implicit global).
        f = fs.open("lat_long.txt", "a");
        f.write(str);
        f.close();
        processing = true;
        console.log("writing to file");
       // NOTE(review): exiting here ends the script after the first record is
       // written, so no further pages could be processed even with a larger
       // loop bound.
       phantom.exit();

    });
 }


 // right here it should delay until the previous page is completed
 // NOTE(review): this disabled loop sits inside the onLoadFinished handler, and
 // a synchronous `while` would presumably keep the script busy so that the
 // callback setting `processing` could never run.
 //  while (!processing)  {
 //       setTimeout(function(){ console.log("waiting....");},1000);
 //    }


};

}

// NOTE(review): this runs as soon as the loop above has finished, not when the
// page loads have completed.
console.log("finished all pages");

回答1:


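If you switch to CasperJS, it is as simple as replacing your page.open() calls with casper.thenOpen() (thenOpen is a method of the casper instance), which queues each page to open only after the previous step has finished. (The question "CasperJS - How to open up all links in an array of links" looks very similar to yours.)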

If you want to stick with plain PhantomJS, you need to start the next page load from the load-finished callback of the previous load (the callback passed to page.open(), or page.onLoadFinished). This is tedious, and needs care to avoid large memory usage. (I did it once or twice, but now simply use CasperJS.)

An alternative approach is to create the page object inside the loop. However, that does not quite answer your question, as the pages will then load in parallel. But you could use setTimeout to stagger each one and avoid a burst of activity if you have hundreds of URLs!




回答2:


Here is the code that ultimately works (using the timeout approach since I wasn't able to get the success callback to work better).

With casperjs installed, I named this file "process.js" and was able to run it from the command line as "casperjs process.js"


// Script setup: one shared web page object and the filesystem module.
var page = require('webpage').create();
var fs = require('fs');

// Forward every console message received from the page to this script's
// console. The output encoding is (re)assigned to utf-8 on each message.
page.onConsoleMessage = function(msg) {
    phantom.outputEncoding = "utf-8";
    console.log(msg);
};


 // this is the unique identifier for the locations.
 // (The middle of the list is elided with "..." in this listing, so the
 // literal below is a placeholder rather than runnable code.)
    var EPAID = ["NED981713837",... , "FLD049985302", "NJD986643153"]; 


// this overwrites the previous output file
// (opened in "w" mode and seeded with the marker "-<>-").
// NOTE: `f` is assigned without `var`, so it is an implicit global.
f = fs.open("lat_long.txt", "w");
f.write("-<>-");
f.close();


// Index of the next EPAID entry to process; advanced by yourFunction().
var count = 0;
// Number of records to process.
// NOTE(review): presumably the length of the full EPAID list — confirm.
var target = 1400;
// written[i] is set to true once record i has been appended to the file,
// so a record is written at most once.
var written = [];

/**
 * Scheduler for the scrape: starts processing of the next record, then
 * re-arms itself with a fixed 5000 ms delay so that page loads are staggered
 * instead of being started all at once. When every record has been started
 * (and the final delay has elapsed) it logs "exiting" and ends the script.
 *
 * Reads and advances the file-level `count`; reads `target` and `EPAID`.
 */
function yourFunction(){

   // Never walk past the end of EPAID: `target` is a hard-coded number, and
   // an out-of-range index would request a URL containing "undefined".
   var limit = Math.min(target, EPAID.length);

   if (count < limit) {

      process(count);
      count++;
      setTimeout(yourFunction, 5000);

   } else {
       console.log("exiting");
       phantom.exit();
       return;
   }
}




/**
 * Scrapes one facility record and appends it to lat_long.txt.
 *
 * Builds the CERCLIS query URL for EPAID[counter], loads it in the shared
 * `page`, injects jQuery, reads the ".result" table and appends one
 * JSON-style object followed by a comma to the output file. The
 * `written[counter]` flag ensures the record is appended at most once even
 * if the load-finished handler fires more than once.
 *
 * @param {number} counter - index into EPAID of the record to scrape.
 */
function process(counter){

    console.log("Beginning record #" + counter);

    //we construct the target url
    var url  = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[counter]  + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;

    // Install the handler before starting the load, so it is already in
    // place when the load completes.
    page.onLoadFinished = function(status) {
        if ( status !== "success" ) {
            // Report failed loads instead of silently dropping the record.
            console.log("Failed to load record #" + counter);
            return;
        }

        page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
            // The function handed to page.evaluate must be self-contained, so
            // its helper is declared inside it and every variable is local
            // (no globals are leaked).
            var str = page.evaluate(function() {
                var values = [];
                var row = $(".result tr");
                var i;

                // Quote and escape a string field so that quotes, backslashes
                // or line breaks in the scraped HTML cannot break the JSON.
                function quote(text) {
                    return JSON.stringify(String(text));
                }

                // Ten passes; each pushes the html of the first matched td
                // and of the td two siblings after it, then advances the row.
                for (i = 0; i < 10; i++) {
                    values.push(row.find('td').html(), row.find('td').next().next().html());
                    row = row.next();
                }

                // Latitude and Longitude are emitted unquoted, as before.
                return "{ \"EPAID\": " + quote(values[0]) + ", " +
                       "\"Name\": " + quote(values[1]) + ", " +
                       "\"City\": " + quote(values[4]) + ", " +
                       "\"State\": " + quote(values[6]) + ", " +
                       "\"ZipCode\": " + quote(values[8]) + ", " +
                       "\"Latitude\": " + values[14] + ", " +
                       "\"Longitude\": " + values[16] + " },";
            });

            // Skip the write when no record text came back.
            if (typeof str !== "string") {
                console.log("No data extracted for record #" + counter);
                return;
            }

            if (written[counter] === undefined) {

                var f = fs.open("lat_long.txt", "a");
                try {
                    f.write(str);
                } finally {
                    // Always release the file handle, even if the write fails.
                    f.close();
                }
                written[counter] = true;
                console.log("Writing to file #"+  counter);
            }

        });
    };

    page.open(url);
}

 // Entry point: announce the start, then kick off the timer-driven loop.
 console.log("Start...");

yourFunction();


来源:https://stackoverflow.com/questions/21288883/using-casperjs-and-phantomjs-to-scrape-multiple-pages

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!