how to scrape links with phantomjs

前端 未结 3 553
囚心锁ツ
囚心锁ツ 2020-12-24 08:58

Can PhantomJS be used as an alternative to BeautifulSoup?

I am trying to search on Etsy and visit all the links in term. In Python, I know how to do this (with Beau

相关标签:
3条回答
  • 2020-12-24 09:05

    Here is some code I recently wrote that scrapes URLs using PhantomJS. If you provide only a URL, it will display all URLs on the page; if you also supply an argument of class|id followed by a class/id name, it will display only the URLs found inside elements with that class/id.

    ////////////////////////////////////////////////////////// 
    /////  PhantomJS URL Scraper v.1.3 ///// 
    // 
    // Copyrighted by +A.M.Danischewski  2016+ (c)
    // This program may be reutilized without limits, provided this 
    // notice remain intact. 
    // 
    // Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
    //
    //   Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg" 
    //   Argument 2: "class" or "id" 
    //   Argument 3: If Argument 2 was provided, "class name" or "id name" 
    // 
    // By default this program will display ALL urls from a user supplied URL.  
    // If a class name or id name is provided then only URL's from the class 
    // or id are displayed.  
    //  
    /////////////////////////////////// 
    
    // Script entry point: read the command-line arguments and start loading
    // the requested page.
    //
    //   address    -- system.args[1], the URL to open
    //   querytype  -- system.args[2], optional: "class" or "id"
    //   queryclass -- system.args[3], optional: the class or id name to restrict to
    //
    // All three are declared here so they are ordinary script-level variables
    // rather than implicit globals; the load-finished handler reads them.
    var page = require('webpage').create(),
        system = require('system'),
        address,
        querytype,
        queryclass;

    if (system.args.length === 1) {
      // system.args[0] is the script name, so a length of 1 means no URL was given.
      console.log(' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]');
      phantom.exit();
    } else {
      // phantom.exit() does not halt the script on the spot, so the page load
      // lives in the else branch to avoid opening an undefined address after
      // the usage message has been printed.
      address = system.args[1];
      querytype = system.args[2];
      queryclass = system.args[3];
      page.open(address, function(status) {
        if (status !== 'success') {
          console.log('Error loading address: ' + address);
        }
      });
    }
    
    // Relay every console message emitted inside the loaded page to the
    // PhantomJS console so it shows up in the script's output.
    page.onConsoleMessage = function(pageMessage) {
      console.log(pageMessage);
    };
    
    /**
     * Load-finished handler: print the href attribute of every anchor that
     * matches the command-line query, one per line, then exit PhantomJS.
     *
     *   querytype === "class" -> anchors inside every element with class `queryclass`
     *   querytype === "id"    -> anchors inside the element with id `queryclass`
     *   anything else         -> every anchor in the document
     *
     * @param {string} status Load status reported by PhantomJS. It is not
     *   inspected here; load failures are reported by the page.open callback.
     */
    page.onLoadFinished = function(status) {
       // Runs inside the page context. The query is passed in as arguments
       // instead of being concatenated into function source text, so names
       // containing quotes cannot break or alter the evaluated code.
       var collectHrefs = function(kind, name) {
          var roots = [];
          var hrefs = [];
          var i, j, anchors, found;

          if (kind === 'class') {
             found = document.getElementsByClassName(name);
             for (i = 0; i < found.length; i++) {
                roots.push(found[i]);
             }
          } else if (kind === 'id') {
             // getElementById yields null when nothing matches; treat that as
             // "no links" rather than throwing inside the page.
             found = document.getElementById(name);
             if (found) {
                roots.push(found);
             }
          } else {
             roots.push(document);
          }

          for (i = 0; i < roots.length; i++) {
             anchors = roots[i].querySelectorAll('a');
             for (j = 0; j < anchors.length; j++) {
                // Only plain strings (or null) are collected: DOM nodes
                // cannot be returned from the page context.
                hrefs.push(anchors[j].getAttribute('href'));
             }
          }
          return hrefs;
       };

       // Pass empty strings for omitted arguments so only strings are sent
       // into the page context.
       var links = page.evaluate(collectHrefs, querytype || '', queryclass || '') || [];

       // Join the array directly rather than splitting a stringified array on
       // commas, which would also split any URL that contains a comma.
       console.log(links.join('\n'));
       phantom.exit();
    };
    
    0 讨论(0)
  • 2020-12-24 09:23

    PhantomJS evaluate() cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable values first:

    var page = require('webpage').create();
    var url = 'http://www.etsy.com/search?q=hello%20kitty';
    
    // Open the search page and print the href of every listing thumbnail link,
    // one per line. Exits with code 1 if the page could not be loaded.
    page.open(url, function(status) {
        if (status !== 'success') {
            // Nothing useful can be scraped from a page that failed to load.
            console.log('Unable to load ' + url + ' (status: ' + status + ')');
            phantom.exit(1);
            return;
        }

        // list all the a.href links in the hello kitty etsy page
        // (mapped to plain strings because DOM nodes cannot be returned
        // from the page context)
        var links = page.evaluate(function() {
            return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
                return link.getAttribute('href');
            });
        }) || [];   // fall back to an empty list so join() cannot throw before exit
        console.log(links.join('\n'));
        phantom.exit();
    });
    

    Note: here we use [].map.call() in order to treat a NodeList as a standard Array.

    0 讨论(0)
  • 2020-12-24 09:24

    The only problem with your code is that you do not understand phantomjs scopes. You have phantom and page scopes. You tried to return JavaScript DOM object references (those can't be serialized) from page scope (page.evaluate runs in page scope) to phantom main scope. I think that is not possible. Here follows code that works:

    // Search results page to scrape, and the PhantomJS page object used to load it.
    var url = 'http://www.etsy.com/search?q=hello%20kitty',
        page = require('webpage').create();
    
    // Debug aid: for the response whose URL is exactly the requested `url`,
    // log its HTTP status, and when the status is 200 also log the URL and
    // every response header as "name: value".
    // NOTE(review): PhantomJS may invoke this handler more than once per
    // response, which would repeat the output -- confirm if that matters.
    page.onResourceReceived = function(response) {
        if (response.url === url) {
            console.log('Resource: "' + response.url + '" status: '  + response.status);
    
            if (response.status === 200) {
                console.log(response.url);
                for (var i = 0; i < response.headers.length; i++) {
                    var header = response.headers[i];
                    console.log(header.name + ': ' + header.value);
                }
            }
        }
    };
    
    /**
     * Load-finished handler: log the load status, collect the resolved href of
     * every "a.listing-thumb" anchor in the page, print them numbered from 0
     * (or "No match found!" when there are none), then exit with code 0.
     *
     * @param {string} status Load status reported by PhantomJS; it is logged
     *   but does not change what the handler does.
     */
    page.onLoadFinished = function(status){
        console.log('Status: ' + status);
    
        console.log('Starting evaluate...');
        // The callback runs in the page context and returns plain strings,
        // because DOM nodes cannot be passed back to the PhantomJS side.
        // A missing result is normalised to an empty array so that reading
        // .length below cannot throw and skip phantom.exit().
        var links = page.evaluate(function() {
            var nodes = [],
                matches = document.querySelectorAll("a.listing-thumb");
    
            for (var i = 0; i < matches.length; ++i) {
                nodes.push(matches[i].href);
            }
    
            return nodes;
        }) || [];
        console.log('Done evaluate... count: ' + links.length);
    
        if (links.length > 0) {
            for (var i = 0; i < links.length; ++i) {
                console.log('(' + i + ') ' + links[i]);
            }
        } else {
            console.log("No match found!");
        }
    
        phantom.exit(0);
    };
    
    // Start loading the page; the handlers assigned above are already in
    // place by the time this call is made.
    page.open(url);
    
    0 讨论(0)
提交回复
热议问题