is it possible to write web crawler in javascript?

后端未结

关注

 11  579

深忆病人 2021-02-01 07:48

I want to crawl the page and check for the hyperlinks in that respective page and also follow those hyperlinks and capture data from the page

11条回答

闹比i (楼主)

2021-02-01 08:29

I made an example javascript crawler on github.

It's event driven and use an in-memory queue to store all the resources(ie. urls).

How to use in your node environment

var Crawler = require('../lib/crawler')
var crawler = new Crawler('http://www.someUrl.com');

// crawler.maxDepth = 4;
// crawler.crawlInterval = 10;
// crawler.maxListenerCurrency = 10;
// crawler.redisQueue = true;
crawler.start();

Here I'm just showing you 2 core method of a javascript crawler.

Crawler.prototype.run = function() {
  var crawler = this;
  process.nextTick(() => {
    //the run loop
    crawler.crawlerIntervalId = setInterval(() => {

      crawler.crawl();

    }, crawler.crawlInterval);
    //kick off first one
    crawler.crawl();
  });

  crawler.running = true;
  crawler.emit('start');
}


Crawler.prototype.crawl = function() {
  var crawler = this;

  if (crawler._openRequests >= crawler.maxListenerCurrency) return;


  //go get the item
  crawler.queue.oldestUnfetchedItem((err, queueItem, index) => {
    if (queueItem) {
      //got the item start the fetch
      crawler.fetchQueueItem(queueItem, index);
    } else if (crawler._openRequests === 0) {
      crawler.queue.complete((err, completeCount) => {
        if (err)
          throw err;
        crawler.queue.getLength((err, length) => {
          if (err)
            throw err;
          if (length === completeCount) {
            //no open Request, no unfetcheditem stop the crawler
            crawler.emit("complete", completeCount);
            clearInterval(crawler.crawlerIntervalId);
            crawler.running = false;
          }
        });
      });
    }

  });
};

Here is the github link https://github.com/bfwg/node-tinycrawler. It is a javascript web crawler written under 1000 lines of code. This should put you on the right track.

0 讨论(0)

查看其它11个回答