I am tring to scrape a few sites. Here is my code:
for (var i = 0; i < urls.length; i++) {
url = urls[i];
console.log(\"Start scraping: \" + url);
PhantomJS is asynchronous. By calling page.open()
multiple times using a loop, you essentially rush the execution of the callback. You're overwriting the current request before it is finished with a new request which is then again overwritten. You need to execute them one after the other, for example like this:
page.open(url, function () {
waitFor(function() {
// something
}, function() {
page.open(url, function () {
waitFor(function() {
// something
}, function() {
// and so on
});
});
});
});
But this is tedious. There are utilities that can help you with writing nicer code like async.js. You can install it in the directory of the phantomjs script through npm.
var async = require("async"); // install async through npm
var tests = urls.map(function(url){
return function(callback){
page.open(url, function () {
waitFor(function() {
// something
}, function() {
callback();
});
});
};
});
async.series(tests, function finish(){
fs.write('test.txt', output);
phantom.exit();
});
If you don't want any dependencies, then it is also easy to define your own recursive function (from here):
var urls = [/*....*/];
function handle_page(url){
page.open(url, function(){
waitFor(function() {
// something
}, function() {
next_page();
});
});
}
function next_page(){
var url = urls.shift();
if(!urls){
phantom.exit(0);
}
handle_page(url);
}
next_page();