Node fs.readdir freezing in folders with too many files

狂风中的少年 提交于 2020-01-23 17:28:47

问题


In Node.js I have to read files in a folder and for each file get file handler info, this is my simplest implementation using fs.readdir:

FileServer.prototype.listLocal = function (params) {
            var self = this;
            // Defaults; any key in `params` overrides them.
            var options = {
                limit: 100,
                desc: 1
            };
            // override defaults
            for (var attrname in params) { options[attrname] = params[attrname]; }

            // media path is the media folder
            var mediaDir = path.join(self._options.mediaDir, path.sep);
            return new Promise((resolve, reject) => {
                fs.readdir(mediaDir, (error, results) => {
                    if (error) {
                        self.logger.error("FileServer.list error:%s", error);
                        return reject(error);
                    }
                    // Filter by extension FIRST, then cut to max files:
                    // slicing the raw listing first could return fewer than
                    // `limit` matches even when enough .mp3 files exist.
                    results = results.filter(item => {
                        return (item.indexOf('.mp3') > -1);
                    });
                    results = results.slice(0, options.limit);
                    // format meta data. fs.statSync replaces the previous
                    // openSync/fstatSync/closeSync triple, which leaked the
                    // file descriptor whenever fstatSync threw.
                    results = results.map(file => {
                        var filePath = path.join(self._options.mediaDir, path.sep, file);
                        var fstat = fs.statSync(filePath);
                        var item = {
                            name: file,
                            path: filePath,
                            // file size in bytes
                            size: fstat.size,
                            sizehr: self.formatSizeUnits(fstat.size),
                            // "Birth Time" Time of file creation. Set once when the file is created.
                            birthtime: fstat.birthtime,
                            // "Modified Time" Time when file data last modified.
                            mtime: fstat.mtime,
                            // "Access Time" Time when file data last accessed.
                            atime: fstat.atime
                        };
                        item.timestamp = new Date(item.mtime).getTime();
                        item.media_id = path.basename(filePath, '.mp3');
                        return item;
                    });
                    if (options.desc) { // sort by most recent
                        results.sort(function (a, b) {
                            return b.timestamp - a.timestamp;
                        });
                    } else { // sort by older
                        results.sort(function (a, b) {
                            return a.timestamp - b.timestamp;
                        });
                    }
                    return resolve(results);
                })
            });
        }

so that for each file I get an array of items

{
  "name": "sample121.mp3",
  "path": "/data/sample121.mp3",
  "size": 5751405,
  "sizehr": "5.4850 MB",
  "birthtime": "2018-10-08T15:26:08.397Z",
  "mtime": "2018-10-08T15:26:11.650Z",
  "atime": "2018-10-10T09:01:48.534Z",
  "timestamp": 1539012371650,
  "media_id": "sample121"
}

That said, the problem is that it's known node.js fs.readdir may freeze the Node I/O loop when the folder being listed contains a large number of files — say, from tens of thousands to hundreds of thousands or more. This is a known issue - see here for more info. There are also plans to improve fs.readdir in some way, like streaming - see here about this.

In the meanwhile I'm looking for a workaround, because my folders are pretty large. Since the problem is that the event loop gets frozen, someone proposed a solution using process.nextTick, which I have assembled here

FileServer.prototype.listLocalNextTick = function (params) {
            var self = this;
            var options = {
                limit: 100,
                desc: 1
            };
            // override defaults
            for (var attrname in params) { options[attrname] = params[attrname]; }

            // media path is the media folder
            var mediaDir = path.join(self._options.mediaDir, path.sep);
            return new Promise((resolve, reject) => {
                // Process one entry per event-loop tick so a huge listing
                // never blocks the loop. The entry processor RETURNS the
                // mapped value, which is stored back into the array by
                // index — reassigning the callback's parameter (as the
                // original did with `file = item`) never updates the array.
                // `onDone` fires once after the last entry is processed.
                var AsyncArrayProcessor = function (inArray, inEntryProcessingFunction, onDone) {
                    var elemNum = 0;
                    var arrLen = inArray.length;
                    var ArrayIterator = function () {
                        try {
                            inArray[elemNum] = inEntryProcessingFunction(inArray[elemNum]);
                        } catch (err) { // e.g. statSync on a vanished file
                            return reject(err);
                        }
                        elemNum++;
                        if (elemNum < arrLen) process.nextTick(ArrayIterator);
                        else onDone();
                    }
                    if (elemNum < arrLen) process.nextTick(ArrayIterator);
                    else onDone(); // empty array: complete immediately
                }
                fs.readdir(mediaDir, function (error, results) {
                    if (error) {
                        self.logger.error("FileServer.list error:%s", error);
                        return reject(error);
                    }
                    // filter default ext first, then cut to max files, so
                    // the limit applies to matches, not the raw listing
                    results = results.filter(item => {
                        return (item.indexOf('.mp3') > -1);
                    });
                    results = results.slice(0, options.limit);
                    // Map one file name to its metadata item and return it.
                    // fs.statSync avoids leaking a descriptor when stat throws.
                    var ProcessDirectoryEntry = function (file) {
                        var filePath = path.join(self._options.mediaDir, path.sep, file);
                        var fstat = fs.statSync(filePath);
                        var item = {
                            name: file,
                            path: filePath,
                            // file size in bytes
                            size: fstat.size,
                            sizehr: self.formatSizeUnits(fstat.size),
                            // "Birth Time" Time of file creation. Set once when the file is created.
                            birthtime: fstat.birthtime,
                            // "Modified Time" Time when file data last modified.
                            mtime: fstat.mtime,
                            // "Access Time" Time when file data last accessed.
                            atime: fstat.atime
                        };
                        item.timestamp = new Date(item.mtime).getTime();
                        item.media_id = path.basename(filePath, '.mp3');
                        return item;
                    }//ProcessDirectoryEntry
                    // Sort and resolve only AFTER all entries are mapped:
                    // the original sorted and resolved synchronously, before
                    // any nextTick-scheduled work had set `timestamp`.
                    AsyncArrayProcessor(results, ProcessDirectoryEntry, function () {
                        if (options.desc) { // sort by most recent
                            results.sort(function (a, b) {
                                return b.timestamp - a.timestamp;
                            });
                        } else { // sort by older
                            results.sort(function (a, b) {
                                return a.timestamp - b.timestamp;
                            });
                        }
                        resolve(results);
                    });
                });
            });
        }//listLocalNextTick

This seems to avoid the original issue, but I can no longer map the file list to the items with file metadata as I did before. Because of the async nature of process.nextTick, when AsyncArrayProcessor runs ProcessDirectoryEntry on each file entry I cannot get back the modified results array, as I could in the previous listLocal function where I simply did a synchronous Array.map over the results. How can I patch listLocalNextTick to behave like listLocal while keeping the process.nextTick approach?

[UPDATE]

According to the proposed solution, this is the best implementation so far:

       /**
         * Scan files in directory
         * @param {String} needle 
         * @param {object} options 
         * @returns {nodeStream}
         */
        scanDirStream : function(needle,params) {
            var options = {
                type: 'f',
                name: '*'
            };
            for (var attrname in params) { options[attrname] = params[attrname]; }
            return new Promise((resolve, reject) => {
                var opt=[needle];
                for (var k in options) {
                    var v = options[k];
                    if (!Util.empty(v)) {
                        opt.push('-' + k);
                        opt.push(v);
                    }
                };
                var data='';
                var listing = spawn('find',opt)
                listing.stdout.on('data', _data => {
                    var buff=Buffer.from(_data, 'utf-8').toString();
                    if(buff!='') data+=buff;
                })
                listing.stderr.on('data', error => {
                    return reject(Buffer.from(error, 'utf-8').toString());
                });
                listing.on('close', (code) => {
                    var res = data.split('\n');
                    return resolve(res);
                });
            });

Example of usage:

scanDirStream(mediaRoot, {
        name: '*.mp3'
    })
    .then(results => {
        // %d expects a number: report the count, not the array itself
        console.info("files:%d", results.length);
    })
    .catch(error => {
        console.error("error %s", error);
    });

This can eventually be modified to add a tick callback on every stdout 'data' event emitted when a new file arrives in the directory listing.


回答1:


I have Created a wrapper around find for it but you could use dir or ls in the same way.

const { spawn } = require('child_process');

/**
 * findNodeStream
 * @param {String} dir 
 * @returns {nodeStream}
 */
const findNodeStream = (dir, options) => {
  // Assemble the argv, tolerating a missing/empty `options` entry.
  const args = [dir, options].flat().filter((arg) => arg);
  return spawn('find', args);
};

/**
 * Usage Example:
  let listing = findNodeStream('dir',[options])
  listing.stdout.on('data', d=>console.log(d.toString()))
  listing.stderr.on('data', d=>console.log(d.toString()))
  listing.on('close', (code) => {
    console.log(`child process exited with code ${code}`);
  });
*/

This allows you to stream a directory in chunks rather than reading it whole, as fs.readdir does.

Important

Node.js > 12.11.1 has async directory-iteration support, landed in cbd8d71 ( https://github.com/nodejs/node/commit/cbd8d715b2286e5726e6988921f5c870cbf74127 ) as fs{Promises}.opendir(), which returns an fs.Dir exposing an async iterator. tada

https://nodejs.org/api/fs.html#fs_fspromises_opendir_path_options

const fs = require('fs');

async function print(path) {
  const dir = await fs.promises.opendir(path);
  for await (const dirent of dir) {
    console.log(dirent.name);
  }
}
print('./').catch(console.error);


来源:https://stackoverflow.com/questions/53125592/node-fs-readdir-freezing-in-folders-with-too-many-files

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!