Incremental and non-incremental URLs in Node.js with cheerio and request

大城市里の小女人 提交于 2019-12-03 17:09:29
xmojmr

You can find out how to do it by studying well-known existing website copiers (closed source or open source).

For example, use a trial copy of http://www.tenmax.com/teleport/pro/home.htm to scrape your pages and then try the same with http://www.httrack.com; you should get a clear idea of how they did it (and how you can do it).

The key programming concepts are a lookup cache and a task queue.

Recursion is not the right approach here if your solution needs to scale to several Node.js worker processes and to many pages.

EDIT: after clarifying comments

Before you start reworking your scraping engine into a more scalable architecture, as a new Node.js developer you can start simply with a synchronous-style alternative to Node.js callback hell, as provided by the wait.for package created by @lucio-m-tato.

The code below worked for me with the links you provided:

var request = require('request');
var cheerio = require('cheerio');
var wait = require("wait.for");

/**
 * Node-style adapter around `request`, shaped so it can be passed to `wait.for`.
 *
 * Fetches `url` and reports the outcome through `callback`:
 *  - request reported an error: `callback(error, response)`
 *  - status code is 200:        `callback(null, html)` with the response body
 *  - any other status code:     `callback(new Error("Status not 200 OK"), response)`
 *
 * @param {string} url - Address handed to `request`.
 * @param {function(?Error, *)} callback - Node-style `(err, result)` callback.
 */
function requestWaitForWrapper(url, callback) {
  request(url, function(error, response, html) {
    if (error) {
      callback(error, response);
      return;
    }

    // Strict comparison so the status is never matched through type coercion.
    if (response.statusCode === 200) {
      callback(null, html);
      return;
    }

    callback(new Error("Status not 200 OK"), response);
  });
}

/**
 * Fetch one page of search results and extract the work id and result count.
 *
 * The request goes through `wait.for`, so this function blocks until the
 * response arrives. NOTE(review): per the wait.for package this presumably
 * has to be called from inside a fiber — confirm against its README.
 *
 * @param {string} baseUrl - Search URL that already carries a query string;
 *   `&s=<offset>` is appended to it.
 * @param {number} s - Offset sent as the `s` query parameter.
 * @returns {{s: number, id: (string|undefined), total: number}} The offset
 *   that was requested, the `id` attribute of the `work` element, and the
 *   `total` attribute of the `records` element parsed as a base-10 integer
 *   (NaN when the attribute is absent or not numeric).
 */
function readBookInfo(baseUrl, s) {
  var html = wait.for(requestWaitForWrapper, baseUrl + '&s=' + s.toString());
  var $ = cheerio.load(html, {
    xmlMode: true
  });

  return {
    s: s,
    id: $('work').attr('id'),
    // Explicit radix: the count is decimal, so never let a prefix such as
    // "0x" switch parseInt into another base.
    total: parseInt($('records').attr('total'), 10)
  };
}

/**
 * Fetch the full record of a single work from the Trove API and return its
 * title and contributor text.
 *
 * The request goes through `wait.for`, so the call blocks until the response
 * has arrived.
 *
 * @param {number|string} id - Work identifier; `toString()` is called on it,
 *   so it must not be null or undefined.
 * @returns {{title: string, contributor: string}} Text of the `title` and
 *   `contributor` elements as returned by cheerio's `.text()`.
 */
function readWorkInfo(id) {
  var workUrl = 'http://api.trove.nla.gov.au/work/' + id.toString() + '?key=6k6oagt6ott4ohno&reclevel=full';
  var xml = wait.for(requestWaitForWrapper, workUrl);

  var parserOptions = { xmlMode: true };
  var query = cheerio.load(xml, parserOptions);

  var titleText = query('title').text();
  var contributorText = query('contributor').text();

  return { title: titleText, contributor: contributorText };
}

/**
 * Walk every record of the Trove thesis search and print one
 * "id;contributor;title" line per record via console.log.
 *
 * The first result page (s = 0) is fetched once and used both for the total
 * record count and as the first record, instead of being requested a second
 * time by the loop.
 */
function main() {
  // NOTE(review): the API key is embedded in this URL — consider loading it
  // from configuration instead of hard-coding it.
  var baseBookUrl = 'http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&l-advformat=Thesis&sortby=dateDesc&q=+date%3A[2000+TO+2014]&l-availability=y&l-australian=y&n=1';
  var bookInfo = readBookInfo(baseBookUrl, 0);
  var total = bookInfo.total;

  for (var s = 0; s < total; s++) {
    // Offset 0 is already in hand from the count lookup above.
    if (s > 0) {
      bookInfo = readBookInfo(baseBookUrl, s);
    }
    var workInfo = readWorkInfo(bookInfo.id);
    console.log(bookInfo.id + ";" + workInfo.contributor + ";" + workInfo.title);
  }
}

// Entry point: hand main to wait.for's launchFiber. The helpers main calls use
// wait.for(...), which presumably requires running inside a fiber — see the
// wait.for README to confirm.
wait.launchFiber(main);

You could also use the async module to handle multiple requests and to iterate through several pages. Read more about async at https://github.com/caolan/async.

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!