How can I handle a file of 30'000 urls without memory leaks?


Question


Inside the .txt file there are 30,000 URLs to scrape. When I built the program I tested it with 10 URLs and everything was fine, but as soon as I switched to the 30k-URL .txt file it crashes after a few minutes. I think it starts reading the .txt file and then crashes due to memory issues. Here are the console output and my code. What's the best way to handle such a file?

FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
 1: 0x100ba0c4a node::Abort() (.cold.1) [/usr/local/bin/node]
 2: 0x100084961 node::FatalError(char const*, char const*) [/usr/local/bin/node]
 3: 0x100084a89 node::OnFatalError(char const*, char const*) [/usr/local/bin/node]
 4: 0x10017fa4d v8::Utils::ReportOOMFailure(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
 5: 0x10017f9f7 v8::internal::V8::FatalProcessOutOfMemory(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
 6: 0x100299baf v8::internal::Heap::FatalProcessOutOfMemory(char const*) [/usr/local/bin/node]
 7: 0x10029af4c v8::internal::Heap::MarkCompactPrologue() [/usr/local/bin/node]
 8: 0x100298b04 v8::internal::Heap::PerformGarbageCollection(v8::internal::GarbageCollector, v8::GCCallbackFlags) [/usr/local/bin/node]
 9: 0x1002975ab v8::internal::Heap::CollectGarbage(v8::internal::AllocationSpace, v8::internal::GarbageCollectionReason, v8::GCCallbackFlags) [/usr/local/bin/node]
10: 0x100296a2a v8::internal::Heap::HandleGCRequest() [/usr/local/bin/node]
11: 0x10026d9a5 v8::internal::StackGuard::HandleInterrupts() [/usr/local/bin/node]
12: 0x1004e1383 v8::internal::Runtime_StackGuard(int, unsigned long*, v8::internal::Isolate*) [/usr/local/bin/node]
13: 0x1007502f9 Builtins_CEntry_Return1_DontSaveFPRegs_ArgvOnStack_NoBuiltinExit [/usr/local/bin/node]
14: 0x10073c5fb Builtins_StringPrototypeMatch [/usr/local/bin/node]
15: 0x267b75f209cb
zsh: abort      node scrape.js

const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");

let userAgent = new UserAgent({ deviceCategory: 'desktop' });
let options = {
  headers: { userAgent }
};
let exec = require('child_process').exec;

const mysql = require('mysql2/promise');
let con = mysql.createPool({
      host: "xxx.xxx.xxx.xxx",
      user: "xxx",
      password: "xxxx",
      database: "xxx"
    });


async function run() {

    let file = fs.readFileSync('urls.txt');
    let urls = file.toString().split('\r\n');

    console.log(urls);

    const numeroUrl = urls.length;
    let urlsArray = [];
    console.log("numeroUrl : " + numeroUrl);
    for (let i = 1; i < numeroUrl; i++) {
      for (let y = 1; y < 6; y++) {
        urlsArray.push(urls[y-1] + '&page=' + y);
      }
    }
    Promise.map(urlsArray, parseUrl, {concurrency: 10}).then(function(data) {
        // all done here
        console.log("Done!!!");
    });
}


async function parseUrl(url) {
  try {
    let response = await axios.get(url, {
      headers: { 
        'User-Agent': new UserAgent() 
      }  
    });
    console.log(url + " " + response.status);
    if (response.status >= 201) {

      fs.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status, (error) => {
        if (error) {
            console.error(`Could not save the url status error to a file: ${error}`);
            return;
        }

        console.log('Saved Url error to ' + fileUrlErrors);
        });  
      
    } else if (response.status == 200) {

      let $ = cheerio.load(response.data);

      prodotti = $(".item");

      let items = $(prodotti).get();

      for (let item of items) {

        let title = $(".title", item).text();
        if (!title) {
          title = $(".title2", item).text();
        }
        
        let price = $(".price", item).text();
        if (!price) {
          price = $(".price2", item).text();
        }
        

        if (title) {
          const prodotto = [
          [
          title,
          price]
          ];
          let result = await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }

        }
    } 

    } catch (error) {
        //console.error(error);
        if (error.response) {
          // Request made and server responded
            await fs.appendFile(fileUrlErrors, '\n' + url + " - " + error.response.status, (error) => {
            if (error) {
                console.error(`Could not save the url status error to a file: ${error}`);
                return;
            }
            console.log('Saved Url error to ' + fileUrlErrors);
            });  
        }

    }
}

run().then(() => {
    console.log("Done!");
}).catch(err => {
    console.log(err);
});

Answer 1:


As discussed in the comments, your parseUrl() function has a mix of promises and plain asynchronous callbacks, and that's a disaster. You really cannot mix them. The best solution is to do all your asynchronous control flow with promises, and if you have some non-promise-returning asynchronous callbacks, promisify them: either manually, with util.promisify(), by using the promisified version of the API, or by getting a version of the library that contains promise support.
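For example, the callback-style fs.appendFile() used in the question can be given a promise interface either with util.promisify() or with the built-in fs.promises API (a minimal sketch, not specific to this scraper):

const fs = require('fs');
const util = require('util');

// Option 1: wrap the callback-style API yourself
const appendFileAsync = util.promisify(fs.appendFile);

// Option 2: use the promise-based API that ships with Node.js
// (fs.promises.appendFile() already returns a promise)

async function logLine(file, line) {
  await appendFileAsync(file, line);            // promisified version
  // or: await fs.promises.appendFile(file, line);
}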

Once you've converted everything to promise-based control flow, you can use async and await and the other promise control-flow tools. Only then will your parseUrl() function return a promise that resolves when all the underlying asynchronous operations are done, and only then will you get proper error propagation too.

Here's an example that fixes parseUrl() to properly use promises for all asynchronous operations and then uses await in your for loop to process your urls one at a time, so you won't have all of them in flight at the same time taking up too much memory:

const fs = require('fs');
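// the snippet assumes cheerio, axios, UserAgent, fileUrlErrors and the
// mysql2 pool `con` are already set up as in the question's code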

async function run() {
    const file = fs.readFileSync('urls.txt');
    const urls = file.toString().split('\r\n');

    // count the number of urls inside .txt file
    const numberOfUrls = urls.length;
    console.log("There are : " + numberOfUrls + " urls");

    // Add page to url and use the scrape function
    for (let i = 1; i < numberOfUrls; i++) {
        for (let y = 1; y < 6; y++) {
            let url = urls[y - 1] + '&page=' + y;
            await parseUrl(url);
        }
    }

    async function parseUrl(url) {
        try {
            const response = await axios.get(url, {
                headers: {
                    'User-Agent': new UserAgent()
                }
            });
            if (response.status >= 201) {
                await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
            }  else if (response.status == 200) {
                const $ = cheerio.load(response.data);
                const prodotti = $(".result");

                // get items into a normal array so we can use a normal for loop
                const items = $(prodotti).get();
                for (let item of items) {
                    const title = $("title", item).text();
                    const code = $(".code", item).text();
                    if (title) {
                        const prodotto = [
                            [title, code]
                        ];
                        // promise support in your mysql database requires the mysql2 module
                        const result = await con.query("INSERT INTO Items (title, code) VALUES ? ON DUPLICATE KEY UPDATE code=VALUES(code)", [prodotto]);
                        console.log('Prodotto ' + code + ' inserito nel DB.');
                        console.log(prodotto);
                    }

                }
            }
        } catch (error) {
            console.error(error);
            throw error;          // propagate error back to caller
        }
    }
}

run().then(() => {
    console.log("all done");
}).catch(err => {
    console.log(err);
});

Note, the best place to get promise support for your mysql database (I'm guessing that's the database you're using) is in the mysql2 module on NPM. You will also probably have to change your other code that sets up the database connection to use promises (code which you don't show here).
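If the rest of that setup isn't promise-based yet, a minimal sketch of a promise-aware pool looks like this (connection details are placeholders, as in the question):

const mysql = require('mysql2/promise');

// createPool() from mysql2/promise returns a pool whose query() returns promises
const con = mysql.createPool({
  host: "xxx.xxx.xxx.xxx",   // placeholder connection details
  user: "xxx",
  password: "xxxx",
  database: "xxx"
});

// queries can then be awaited directly:
// const [rows] = await con.query("SELECT 1");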

Summary of Changes:

  1. Switch to fs.promises.appendFile() to get promise support.
  2. Change all variable declarations to let or const. No use of var.
  3. Declare all variables properly so there are no accidental globals.
  4. Switch to use mysql2 so you can use promise support for con.query().
  5. Change if and if to if and else if since that's really what your logic is.
  6. file.toString().split('\r\n') is already an array so there's no need to use Object.keys() on it to get the length. You can just directly reference the .length property on the array.

For Better Performance

The cause of your memory issue was that you were running ALL the requests in flight at the same time which uses a maximum amount of memory. The solution above goes all the way to the other side and runs ONE request at a time. This will use the minimum amount of memory, but it also sacrifices some end-to-end performance. For maximal performance, while using a bit more memory, you can often gain a lot by running N requests in flight at the same time where N depends upon your circumstances but can often be 3-10. This keeps the network busy fetching data while your CPU is busy working on previous responses.

To do that, you need some code that manages your asynchronous operations so only N of them run at a time. One such function is mapConcurrent(). It iterates your array for you, firing up N requests, and each time one of those N finishes it fires off the next one, gathering the results in order into a final array for you.
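A minimal sketch of such a concurrency-limited mapper (a plain-promise version for illustration, not necessarily identical to the mapConcurrent() referenced above):

// run fn(item, index) for every item in array, with at most `limit` calls in flight at once
async function mapConcurrent(array, limit, fn) {
  const results = new Array(array.length);
  let nextIndex = 0;

  // each worker keeps pulling the next unclaimed item until the array is exhausted
  async function worker() {
    while (nextIndex < array.length) {
      const i = nextIndex++;                  // claim an index synchronously (no race in single-threaded JS)
      results[i] = await fn(array[i], i);     // store the result at its original position
    }
  }

  // start up to `limit` workers and wait for all of them to drain the array
  const workers = [];
  for (let w = 0; w < Math.min(limit, array.length); w++) {
    workers.push(worker());
  }
  await Promise.all(workers);
  return results;
}

// usage with the parseUrl() from above, 10 requests in flight at a time:
// mapConcurrent(urlsArray, 10, parseUrl).then(() => console.log("Done!"));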




Answer 2:


Code updated

const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");

let userAgent = new UserAgent({ deviceCategory: 'desktop' });
let options = {
  headers: { userAgent }
};
let exec = require('child_process').exec;

const mysql = require('mysql2/promise');
let con = mysql.createPool({
      host: "xxx.xxx.xxx.xxx",
      user: "xxx",
      password: "xxxx",
      database: "xxx"
    });

async function run() {

    let file = fs.readFileSync('urls.txt');
    let urls = file.toString().split('\r\n');

    console.log(urls);

    const numeroUrl = urls.length;
    let urlsArray = [];
    console.log("numeroUrl : " + numeroUrl);
    for (let i = 1; i < numeroUrl; i++) {
      for (let y = 1; y < 6; y++) {
        urlsArray.push(urls[i-1] + '&page=' + y);
      }
    }
    Promise.map(urlsArray, parseUrl, {concurrency: 10}).then(function(data) {
        // all done here
        console.log("Done!");

    });
}


async function parseUrl(url) {
  try {
    let response = await axios.get(url, {
      headers: { 
        'User-Agent': new UserAgent() 
      }  
    });
    console.log(url + " " + response.status);
    if (response.status >= 201) {

      await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
        console.log('Saved Url error to ' + fileUrlErrors); 
      
    } else if (response.status == 200) {

      let $ = cheerio.load(response.data);

      prodotti = $(".item");

      let items = $(prodotti).get();

      for (let item of items) {

        let title = $(".title", item).text();
        if (!title) {
          title = $(".title2", item).text();
        }
        
        let price = $(".price", item).text();
        if (!price) {
          price = $(".price2", item).text();
        }
        

        if (title) {
          const prodotto = [
          [
          title,
          price]
          ];
          let result = await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }

        }
    } 

    } catch (error) {
        //console.error(error);
        if (error.response) {
          // Request made and server responded
            await fs.promises.appendFile(fileUrlErrors, '\n' + url + " - " + error.response.status);
            console.log('Saved Url error to ' + fileUrlErrors);
        }

    }
}

run().then(() => {
    console.log("Done!");
}).catch(err => {
    console.log(err);
});



Source: https://stackoverflow.com/questions/62685802/how-can-i-handle-a-file-of-30000-urls-without-memory-leaks
