CasperJS - Scraper not navigating to the next page

最后都变了- 提交于 2019-12-25 03:59:16

问题


The following code is a simple scraper written in CasperJS.

// CasperJS instance that drives the whole scrape.
var casper = require('casper').create();

// Selector of the pagination "next" link, and the accumulator for the
// hrefs collected from the result pages.
var nextBtn = "a.navigation-button.next";
var allLinks = [];

// URL-shaped pattern; `regex` is what the collected links are matched against.
var exp = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(exp);

// Positional command-line arguments:
//   0 - starting URL, 1 - first page number, 2 - end page number.
var url = casper.cli.get(0);
var page1 = casper.cli.get(1);
var page2 = casper.cli.get(2);
// Fourth positional argument; it is read here but not used by the visible code.
var proxy = casper.cli.get(3);

var baseUrl = url;

// Open the starting URL, wait for the pagination link to appear, then hand
// over to processPage.
casper.start(baseUrl);

casper.waitForSelector(nextBtn, processPage);

casper.run();

// Callback handed to casper.waitForSelector(nextBtn, ...); `this` is used as
// the casper instance.
// For every i from page1 up to (but not including) page2 it evaluates
// getPageData in the page, appends the result to allLinks, and queues a click
// on the "next" link followed by an echo of the current URL.
// NOTE(review): the for loop runs to completion synchronously, whereas
// thenClick()/then() only queue steps for later execution - presumably this is
// why only the first page's data ends up in allLinks.
function processPage() {
  for (var i = page1; i < page2; i = i + 1) {
      console.log(i);
    // Runs getPageData inside the loaded page and collects the returned hrefs.
    var pageData = this.evaluate(getPageData);
    allLinks = allLinks.concat(pageData);



  // Leaves processPage entirely (not just this iteration) when there is no
  // "next" link on the page.
  if (!this.exists(nextBtn)) {
    return;
  };

  // Queues the click on the "next" link and, after it, a step that prints the
  // URL that is current at that time.
  this.thenClick(nextBtn).then(function() {
      //this.echo(i);
    this.echo(this.getCurrentUrl());
    //this.wait(1000);
  });
};
}

/**
 * Collects the href attribute of every element carrying the 'pro-title' class.
 *
 * Passed to casper.evaluate(), so it only relies on the page's own `document`
 * and on nothing from the surrounding script.
 *
 * @returns {Array} one entry per matching element, in document order; each
 *   entry is whatever getAttribute('href') returned for that element.
 */
function getPageData() {
  var titleNodes = document.getElementsByClassName('pro-title');
  var hrefs = [];

  for (var idx = 0; idx < titleNodes.length; idx += 1) {
    hrefs.push(titleNodes[idx].getAttribute('href'));
  }

  return hrefs;
}


// Final step: open every collected link that looks like a URL and dump the
// profile fields scraped from it.
casper.then(function () {
  this.each(allLinks, function (self, link) {
    // getPageData stores whatever getAttribute('href') returned, which is null
    // for elements without an href; skip anything that is not a URL-like string
    // instead of throwing on link.match().
    if (typeof link !== 'string' || !link.match(regex)) {
      return;
    }

    self.thenOpen(link, function () {
      // Local on purpose: the record must not leak into the global scope and
      // be shared between profile pages.
      var jsonObj = {};

      jsonObj.title = this.fetchText('a.profile-full-name');

      // The markup is read as HTML, so a literal "&" arrives as "&amp;".
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      jsonObj.services = jsonObj.services.replace(/&amp;/g, "and");

      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');

      require('utils').dump(jsonObj);
    });
  });
});

I am executing this script as follows,

casperjs scraping.js http://www.houzz.com/professionals/c/Chicago--IL/p/15 1 3

The first CLI argument is the starting URL. The second and third arguments are the starting and ending page numbers of the scrape.

I am able to extract data from the first page, but I don't understand why I am not able to extract data from any of the subsequent pages.


回答1:


You cannot mix synchronous and asynchronous code like this in processPage. The loop is executed immediately, but the click and the loading of the next page happen asynchronously. The evaluation of the page therefore has to be done asynchronously as well:

/**
 * Scrapes the result pages numbered page1 (inclusive) to page2 (exclusive).
 *
 * Meant to be used as a casper step callback (e.g. for waitForSelector), so
 * `this` must be the casper instance. For each page it evaluates getPageData
 * in the page, appends the result to allLinks, and clicks the "next" link to
 * move on.
 *
 * The step for a page is queued only after the click leading to it has been
 * processed, and the page number is bound per call. This avoids two problems
 * of queuing all steps from a `for (var i ...)` loop: every step seeing the
 * final value of the shared loop variable, and later steps re-scraping the
 * same page once the "next" link has disappeared.
 */
function processPage() {
    var casperInstance = this;

    // Queues the scrape of a single page; stops at the page2 bound.
    function schedulePage(pageNumber) {
        if (pageNumber >= page2) {
            return;
        }

        casperInstance.then(function () {
            console.log(pageNumber);
            var pageData = this.evaluate(getPageData);
            allLinks = allLinks.concat(pageData);

            // No "next" link means this was the last page: do not queue more.
            if (!this.exists(nextBtn)) {
                return;
            }

            this.thenClick(nextBtn).then(function () {
                this.echo(this.getCurrentUrl());
                schedulePage(pageNumber + 1);
            });
        });
    }

    schedulePage(page1);
}


来源:https://stackoverflow.com/questions/35418445/casperjs-scraper-not-navigating-to-the-next-page

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!