问题
I'm using the following script to scrape images using phantom.js:
// PhantomJS script from the question: open a product page and try to collect
// the `src` and `alt` attributes of every <img> element on it.
var page = require('webpage').create();
// Target page. Assigned without `var`, so this creates a global variable.
url = 'https://www.everlane.com/collections/mens-luxury-tees/products/mens-crew-antique'
page.open(url, function(status) {
// Bail out when the load did not succeed: log 'error' and exit.
if (status !== 'success') {
console.log('error');
phantom.exit();
return;
}
// Run the function through page.evaluate and keep whatever comes back in `a`.
// NOTE(review): page.evaluate is documented to return only JSON-serializable
// values, so the entries of this collection presumably do not arrive as usable
// DOM elements — confirm against the PhantomJS documentation.
var a = page.evaluate(function() {
return document.getElementsByTagName('img');
});
// Accumulator for the {src, alt} pairs; also assigned without `var`.
SrcAlt = [];
for (var i=0; i<a.length; i++){
// NOTE(review): if a[i] is not a real element, getAttribute throws here and
// the phantom.exit() below is never reached, which would match the reported
// hang — confirm.
var src = a[i].getAttribute('src');
var alt = a[i].getAttribute('alt');
SrcAlt.push({"src": src, "alt": alt});
}
// Print the collected pairs, then terminate PhantomJS.
console.log(SrcAlt);
phantom.exit();
});
But the script hangs after I define the variable `a`, meaning it doesn't iterate through the images and return the src and alt attributes. When I `console.log(a)` it returns 34, so I believe I have the proper image resources. How can I access the src and alt information? Thanks!
回答1:
You need to evaluate your page after the page has finished loading. You can do this by using the `page.onLoadFinished` callback, which is called once all page content has finished loading and the document is ready. Something like this should work:
// Scrape the `src` and `alt` attributes of every <img> on a page with PhantomJS.
// Kept in ES5 syntax (var, function expressions) because PhantomJS does not
// support ES2015 features such as let/const or arrow functions.
var page = require('webpage').create();
var url = 'https://www.everlane.com/collections/mens-luxury-tees/products/mens-crew-antique';

// Register the handler before calling open() so it is in place when the load
// completes.
page.onLoadFinished = function (status) {
  // Default to success; switched to 1 on any failure below.
  var exitCode = 0;

  try {
    if (status !== 'success') {
      console.log('error');
      exitCode = 1;
      return;
    }

    // page.evaluate() runs its function inside the page's own context and can
    // only hand back JSON-serializable values. Live DOM nodes do not survive
    // that boundary, so the attributes are read here, inside the page, and
    // returned as plain objects.
    var srcAlt = page.evaluate(function () {
      var images = document.getElementsByTagName('img');
      var result = [];
      for (var i = 0; i < images.length; i++) {
        result.push({
          src: images[i].getAttribute('src'),
          alt: images[i].getAttribute('alt')
        });
      }
      return result;
    });

    // Serialize explicitly so the output is readable text rather than
    // depending on how console.log formats objects.
    console.log(JSON.stringify(srcAlt, null, 2));
  } catch (err) {
    console.log('error: ' + err);
    exitCode = 1;
  } finally {
    // Always terminate: an uncaught exception in this callback would otherwise
    // skip phantom.exit() and leave the process hanging.
    phantom.exit(exitCode);
  }
};

page.open(url);
来源:https://stackoverflow.com/questions/19063094/how-to-scrape-javascript-injected-image-src-and-alt-with-phantom-js