How to scrape javascript injected image src and alt with phantom.js?

£可爱£侵袭症+ 提交于 2019-12-14 03:12:31

问题


I'm using the following script to scrape images using phantom.js:

// Question's original (non-working) script: open a product page in PhantomJS
// and try to print the src/alt attributes of every <img> element.
var page = require('webpage').create();
// NOTE(review): assigned without `var`, so `url` becomes an implicit global.
url = 'https://www.everlane.com/collections/mens-luxury-tees/products/mens-crew-antique'

// The callback receives the load status string once the page load completes.
page.open(url, function(status) {

// Bail out early if the page could not be loaded.
if (status !== 'success') {
    console.log('error');
    phantom.exit();
    return;
}

// The function passed to page.evaluate runs inside the web page's context.
// NOTE(review): per the PhantomJS documentation, only JSON-serializable values
// can be returned from page.evaluate; DOM nodes do not survive the trip, so
// `a` is presumably not a usable collection of <img> elements out here.
var a = page.evaluate(function() {
        return document.getElementsByTagName('img');
    });

// NOTE(review): `SrcAlt` is assigned without `var` (implicit global).
SrcAlt = [];
for (var i=0; i<a.length; i++){
    // NOTE(review): if a[i] is not a real DOM element, getAttribute is not a
    // function and this line throws; phantom.exit() below is then never
    // reached, which would explain the reported hang.
    var src = a[i].getAttribute('src');
    var alt = a[i].getAttribute('alt');
    SrcAlt.push({"src": src, "alt": alt});
}
console.log(SrcAlt);
phantom.exit();
});

However, the script hangs after I define the variable a: it never iterates through the images or returns their src and alt attributes. When I console.log(a) it prints 34, so I believe I have the proper image resources. How can I access the src and alt information? Thanks!


回答1:


You need to evaluate your page after it has finished loading. You can do this by using the page.onLoadFinished callback, which is called after all page content has finished loading and the document is ready. Something like this should work:

// Scrape the src and alt attributes of every <img> on the page and print them
// as JSON once PhantomJS reports that the page has finished loading.
var page = require('webpage').create();
var url = 'https://www.everlane.com/collections/mens-luxury-tees/products/mens-crew-antique';

// Register the handler before opening the page so it is already in place
// when the load completes. `status` is 'success' or 'fail'.
page.onLoadFinished = function(status)
{
    // Always exit on failure; otherwise the PhantomJS process keeps running.
    if (status !== 'success') {
        console.log('error');
        phantom.exit(1);
        return;
    }

    // The function passed to page.evaluate runs inside the web page, and only
    // JSON-serializable values can be returned from it. DOM nodes cannot cross
    // that boundary, so the attributes are read here, in the page context, and
    // only plain {src, alt} objects are handed back.
    var srcAlt = page.evaluate(function() {
        var images = document.getElementsByTagName('img');
        var result = [];
        for (var i = 0; i < images.length; i++) {
            result.push({
                "src": images[i].getAttribute('src'),
                "alt": images[i].getAttribute('alt')
            });
        }
        return result;
    });

    // Stringify explicitly: logging an array of objects directly would print
    // "[object Object]" entries instead of the collected values.
    console.log(JSON.stringify(srcAlt, null, 2));
    phantom.exit();
};

page.open(url);


来源:https://stackoverflow.com/questions/19063094/how-to-scrape-javascript-injected-image-src-and-alt-with-phantom-js

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!