Is it possible to write a web crawler in JavaScript?

后端 未结 11 579
深忆病人
深忆病人 2021-02-01 07:48

I want to crawl a page, check it for hyperlinks, follow those hyperlinks, and capture data from each page I reach.

11条回答
  •  闹比i
    闹比i (楼主)
    2021-02-01 08:29

    I made an example JavaScript crawler on GitHub.

    It's event-driven and uses an in-memory queue to store all the resources (i.e. URLs).

    How to use it in your Node environment:

    // Load the crawler class and point a new instance at the start URL.
    const Crawler = require('../lib/crawler');
    const crawler = new Crawler('http://www.someUrl.com');

    // Optional settings; uncomment and assign before calling start() to override.
    // crawler.maxDepth = 4;
    // crawler.crawlInterval = 10;
    // crawler.maxListenerCurrency = 10;
    // crawler.redisQueue = true;

    crawler.start();
    

    Here I'm just showing you the two core methods of a JavaScript crawler.

    /**
     * Starts the crawl loop.
     *
     * Synchronously marks the crawler as running and emits 'start'. The timer
     * is installed on the next tick: crawl() is then invoked once immediately
     * and again every `crawlInterval` milliseconds. The timer handle is kept
     * on `crawlerIntervalId` so it can be cleared later.
     */
    Crawler.prototype.run = function() {
      const self = this;
      const tick = () => {
        self.crawl();
      };

      process.nextTick(() => {
        // Periodic run loop.
        self.crawlerIntervalId = setInterval(tick, self.crawlInterval);
        // Kick off the first crawl right away instead of waiting an interval.
        tick();
      });

      self.running = true;
      self.emit('start');
    };
    
    
    /**
     * Performs one step of the crawl loop.
     *
     * Does nothing while the number of open requests has reached
     * `maxListenerCurrency`. Otherwise asks the queue for its oldest unfetched
     * item and hands it to fetchQueueItem(). When the queue yields no item and
     * no request is open, the crawl is finished once the queue's completed
     * count equals its length: the interval timer is cleared, `running` is
     * reset, and "complete" is emitted with the completed count.
     *
     * @throws Rethrows any error reported by queue.complete() or
     *         queue.getLength().
     */
    Crawler.prototype.crawl = function() {
      var crawler = this;

      // Respect the cap on simultaneously open requests.
      if (crawler._openRequests >= crawler.maxListenerCurrency) return;

      // NOTE(review): the error argument of this callback is not inspected;
      // presumably the queue may use it to signal "nothing left to fetch".
      // Confirm against the queue implementation before making it fatal.
      crawler.queue.oldestUnfetchedItem((err, queueItem, index) => {
        if (queueItem) {
          // Got an item: start the fetch.
          crawler.fetchQueueItem(queueItem, index);
          return;
        }

        // Only consider the crawl finished when no request is still open.
        if (crawler._openRequests !== 0) return;

        crawler.queue.complete((completeErr, completeCount) => {
          if (completeErr)
            throw completeErr;

          crawler.queue.getLength((lengthErr, length) => {
            if (lengthErr)
              throw lengthErr;
            if (length !== completeCount) return;

            // No open request and no unfetched item: stop the crawler.
            // Tear down BEFORE notifying listeners, so a throwing "complete"
            // handler cannot leave the timer running, and a handler that
            // restarts the crawler does not have its state overwritten.
            clearInterval(crawler.crawlerIntervalId);
            crawler.running = false;
            crawler.emit("complete", completeCount);
          });
        });
      });
    };
    

    Here is the GitHub link: https://github.com/bfwg/node-tinycrawler. It is a JavaScript web crawler written in under 1000 lines of code. This should put you on the right track.

提交回复
热议问题