I'm using Puppeteer to scrape some pages, but I'm curious about how to manage this in production for a Node app. I'll be scraping up to 500,000 pages in a day, but the
Here is another example using the puppeteer and generic-pool libraries.
const puppeteer = require('puppeteer');
const genericPool = require('generic-pool');
const debug = require('debug')('chromepool'); // any logger works here; the 'debug' package is one option

async function createChromePool() {
    const factory = {
        create: function() {
            // Open an instance of the Chrome headless browser -- the Heroku buildpack requires these args
            return puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox', '--ignore-certificate-errors'] });
        },
        destroy: function(client) {
            // Close the browser; returning the promise lets the pool wait for shutdown
            return client.close();
        }
    };
    const opts = { max: 1, acquireTimeoutMillis: 120000, priorityRange: 3 };
    global.chromepool = genericPool.createPool(factory, opts);
    global.chromepool.on('factoryCreateError', function(err) {
        debug(err);
    });
    global.chromepool.on('factoryDestroyError', function(err) {
        debug(err);
    });
}

async function destroyChromePool() {
    // Only call this once in your application -- at the point you want to shut down and stop using this pool.
    await global.chromepool.drain();
    await global.chromepool.clear();
}
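To tie it together, here is a minimal usage sketch showing how a scrape job would check a browser out of the pool and return it afterwards. This is not part of the original answer: the scrapePage helper, the URL, and the use of page.title() are placeholders for whatever extraction you actually do, but the acquire/release pattern is the standard generic-pool API.

async function scrapePage(url) {
    // Wait for a browser to become available (up to acquireTimeoutMillis)
    const browser = await global.chromepool.acquire();
    try {
        const page = await browser.newPage();
        await page.goto(url, { waitUntil: 'networkidle2' });
        const title = await page.title(); // placeholder: do your real extraction here
        await page.close();
        return title;
    } finally {
        // Always return the browser to the pool, even if scraping throws,
        // otherwise the pool leaks resources and later acquires time out
        await global.chromepool.release(browser);
    }
}

createChromePool()
    .then(() => scrapePage('https://example.com'))
    .then(title => console.log(title))
    .then(() => destroyChromePool());

Note that with max: 1 the pool serializes all jobs through a single Chrome instance; for a workload like 500,000 pages a day you would presumably raise max (and possibly reuse pages rather than browsers), trading memory for throughput.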