Read n lines of a big text file

后端 未结 3 517
囚心锁ツ
囚心锁ツ 2020-12-29 11:58

The smallest file I have has > 850k lines and every line is of unknown length. The goal is to read n lines from this file in the browser. Reading it fully is no

相关标签:
3条回答
  • 2020-12-29 12:23

    Streams are the feature!
    The WHATWG team is working out the last details of writable and readable streams, and they will be ready soon. Until then, there is a web-stream-polyfill that you can use. They are also working on a way to get a ReadableStream from Blobs [1], but I have already created a way to read a blob in a streaming fashion: Screw-FileReader.

    Yesterday I also created a simple port of node-byline that works with web streams instead.

    So this could be as simple as this:

    // Demo: pipe a Blob through a line splitter into a WritableStream sink and
    // stop early once enough lines have been seen.
    //
    // Simulate a file
    var csv =
    `apple,1,$1.00
    banana,4,$0.20
    orange,3,$0.79`
    
    // Wrap the sample text in a Blob so it can be read the same way as a File.
    var file = new Blob([csv])
    
    // n counts the chunks handed to write(); it is incremented when a chunk is logged.
    var n = 0
    // Filled in by start() below so write() can error the stream.
    var controller
    var decoder = new TextDecoder
    // Sink that decodes each incoming chunk to text and logs it.
    var stdout = new WritableStream({
      start(c) {
          // Keep the stream controller so write() can abort the pipeline later.
          controller = c
        },
        write(chunk, a) {
          // Calling controller.error will also put the byLine in an errored state
          // Causing the file stream to stop reading more data also
          // NOTE: there is no return after controller.error(), so the chunk that
          // triggers the error (n == 1) is still decoded and logged below.
          if (n == 1) controller.error("don't need more lines")
          chunk = decoder.decode(chunk)
          console.log(`chunk[${n++}]: ${chunk}`)
        }
    })
    
    // byLine is not defined in this snippet; presumably it is the line-splitting
    // transform provided by the web-byline script loaded by the page — confirm.
    file
      .stream()
      .pipeThrough(byLine())
      // .pipeThrough(new TextDecoder) something like this will work eventually
      .pipeTo(stdout)
    <script src="https://cdn.rawgit.com/creatorrr/web-streams-polyfill/master/dist/polyfill.min.js"></script>
    <script src="https://cdn.rawgit.com/jimmywarting/Screw-FileReader/master/index.js"></script>
    
    <!-- after a year or so you only need byLine -->
    <script src="https://cdn.rawgit.com/jimmywarting/web-byline/master/index.js"></script>

    0 讨论(0)
  • 2020-12-29 12:34

    I needed to read 250MB UTF-8 encoded files in the browser. My solution was to write a C#-like TextReader class that gives me async, stream-like behaviour.


    TextReader class:

    /**
     * Pull-based line reader over a Blob/File.
     *
     * The blob is read in CHUNK_SIZE-byte slices. Every slice is appended to a
     * byte buffer; as soon as the buffer contains a line feed, all complete lines
     * are decoded at once and cached, and the bytes after the last line feed stay
     * in the buffer until more data arrives. Lines are split on `\n` only, so a
     * preceding `\r` (if any) remains part of the line.
     */
    class TextReader {
        CHUNK_SIZE = 8192000; // I FOUND THIS TO BE BEST FOR MY NEEDS, CAN BE ADJUSTED
        position = 0; // Byte offset of the next slice to read from the blob.
        length = 0; // Total blob size in bytes (set in the constructor).

        // Bytes read so far that do not yet belong to a completed line.
        byteBuffer = new Uint8Array(0);

        // Decoded lines that have not been handed out yet, plus the cursor into them.
        lines = [];
        lineCount = 0;
        lineIndexTracker = 0;

        fileReader = new FileReader();
        textDecoder = new TextDecoder(`utf-8`);

        /** True when every cached line has already been returned by readLine(). */
        get allCachedLinesAreDispatched() {
            return this.lineIndexTracker >= this.lineCount;
        }

        /** True when every byte of the blob has been read. */
        get blobIsReadInFull() {
            return this.position >= this.length;
        }

        /** True when no undecoded bytes are pending. */
        get bufferIsEmpty() {
            return this.byteBuffer.length === 0;
        }

        /** True when readLine() has nothing left to return. */
        get endOfStream() {
            return this.blobIsReadInFull && this.allCachedLinesAreDispatched && this.bufferIsEmpty;
        }

        /**
         * @param {Blob} blob - The blob (or File) to read lines from.
         */
        constructor(blob) {
            this.blob = blob;
            this.length = blob.size;
        }

        /**
         * Reads a blob into an ArrayBuffer using the shared FileReader.
         *
         * @param {Blob} blob - The blob (usually a slice) to read.
         * @returns {Promise<ArrayBuffer>} Resolves with the blob's bytes; rejects
         *     with whatever the FileReader passes to its error handler.
         */
        blob2arrayBuffer(blob) {
            return new Promise((resolve, reject) => {
                this.fileReader.onerror = reject;
                this.fileReader.onload = () => {
                    resolve(this.fileReader.result);
                };

                this.fileReader.readAsArrayBuffer(blob);
            });
        }

        /**
         * Reads up to `count` bytes starting at byte `offset`.
         *
         * @param {number} offset - Zero-based start offset in bytes.
         * @param {number} count - Maximum number of bytes to read.
         * @returns {Promise<ArrayBuffer>} The bytes read; an empty buffer when the
         *     arguments are not integers or fall outside the blob. Rejects when the
         *     underlying read fails.
         */
        async read(offset, count) {
            const isValidRequest =
                Number.isInteger(offset) &&
                Number.isInteger(count) &&
                count >= 1 &&
                offset >= 0 &&
                offset <= this.length - 1;

            if (!isValidRequest) {
                return new ArrayBuffer(0);
            }

            // Clamp the end of the slice to the end of the blob.
            const endIndex = Math.min(offset + count, this.length);

            return this.blob2arrayBuffer(this.blob.slice(offset, endIndex));
        }

        /**
         * Returns the next line of the blob.
         *
         * An async method (rather than `new Promise(async ...)`) is used so that a
         * failed read rejects the returned promise instead of leaving it pending.
         *
         * @returns {Promise<string|null>} The next line including its trailing
         *     `\n`; the final line without `\n` if the blob does not end with one;
         *     `null` once everything has been returned.
         */
        async readLine() {
            if (!this.allCachedLinesAreDispatched) {
                return `${this.lines[this.lineIndexTracker++]}\n`;
            }

            while (!this.blobIsReadInFull) {
                const chunk = new Uint8Array(await this.read(this.position, this.CHUNK_SIZE));
                this.position += chunk.byteLength;

                // Append the new chunk to the bytes left over from earlier reads.
                const merged = new Uint8Array(this.byteBuffer.length + chunk.length);
                merged.set(this.byteBuffer);
                merged.set(chunk, this.byteBuffer.length);
                this.byteBuffer = merged;

                // LINE FEED CHARACTER (\n) IS ONE BYTE LONG IN UTF-8 AND IS 10 IN ITS DECIMAL FORM
                const lastLineFeedIndex = this.byteBuffer.lastIndexOf(10);

                if (lastLineFeedIndex === -1) {
                    continue; // No complete line yet; keep reading.
                }

                // Decode only the complete lines. The tail after the last line feed
                // may end in the middle of a multi-byte character, so it stays in
                // the byte buffer until more data arrives.
                const completeLines = this.textDecoder
                    .decode(this.byteBuffer.subarray(0, lastLineFeedIndex))
                    .split(`\n`);
                this.byteBuffer = this.byteBuffer.slice(lastLineFeedIndex + 1);

                this.lines = completeLines.slice(1);
                this.lineCount = this.lines.length;
                this.lineIndexTracker = 0;

                return `${completeLines[0]}\n`;
            }

            // The blob is exhausted: whatever is left is a final line without `\n`.
            if (!this.bufferIsEmpty) {
                const lastLine = this.textDecoder.decode(this.byteBuffer);
                this.byteBuffer = new Uint8Array(0);
                return lastLine;
            }

            return null;
        }
    }
    

    Usage:

    // Click handler for the "read" button: reads the selected file line by line.
    document.getElementById("read").onclick = async () => {
        const file = document.getElementById("fileInput").files[0];
        if (!file) {
            // Nothing selected: constructing a TextReader would throw on blob.size.
            return;
        }
        const textReader = new TextReader(file);

        // Variant 1: loop until readLine() signals the end with null.
        while (true) {
            const line = await textReader.readLine();
            if (line === null) break;
            // PROCESS LINE
        }

        // OR

        // Variant 2: loop on the endOfStream flag. Use one variant or the other;
        // after variant 1 has drained the reader this loop does not run.
        while (!textReader.endOfStream) {
            const line = await textReader.readLine();
            // PROCESS LINE
        }
    };
    

    Performance:

    I was able to read a single 250MB UTF-8 encoded text file consisting of 1,398,258 rows in ~1.5s, with the JS heap size not exceeding 20MB. In comparison, if I read the same file in one go and then split the resulting string by \n, it still takes ~1.5s; however, the JS heap shoots up to 230MB.

    0 讨论(0)
  • 2020-12-29 12:44

    The logic is very similar to what I wrote in my answer to filereader api on big files, except you need to keep track of the number of lines that you have processed so far (and also the last line read so far, because it may not have ended yet). The next example works for any encoding that is compatible with UTF-8; if you need another encoding look at the options for the TextDecoder constructor.

    If you are certain that the input is ASCII (or any other single-byte encoding), then you can also skip the use of TextDecoder and directly read the input as text using the FileReader's readAsText method.

    // Example wiring for readSomeLines (declared further down): print the first
    // N lines of the chosen file to the console.
    document.getElementById('start').onclick = () => {
        const selectedFile = document.getElementById('infile').files[0];
        if (!selectedFile) {
            console.log('No file selected.');
            return;
        }

        const lineLimit = parseInt(document.getElementById('maxlines').value, 10);
        let nextLineNumber = 1;

        // Invoked once per line that was read.
        const printLine = (line) => {
            console.log(`Line: ${nextLineNumber++}${line}`);
        };

        // Invoked once reading has stopped.
        const reportDone = () => {
            console.log('Read all lines');
        };

        readSomeLines(selectedFile, lineLimit, printLine, reportDone);
    };
    
    /**
     * Read up to and including |maxlines| lines from |file|.
     *
     * The file is read in fixed-size chunks, so only the current chunk and the
     * current unfinished line are held in memory. Lines are split on '\n'; every
     * line is passed on with its trailing '\n', except a final line that is not
     * terminated by one. A file that ends with '\n' does not produce an extra
     * empty line.
     *
     * @param {Blob} file - The file to be read.
     * @param {integer} maxlines - The maximum number of lines to read. Zero or a
     *     negative value reads nothing.
     * @param {function(string)} forEachLine - Called for each line.
     * @param {function(error)} onComplete - Called when the end of the file
     *     is reached or when |maxlines| lines have been read. Receives the
     *     FileReader's error when reading fails, no argument otherwise.
     */
    function readSomeLines(file, maxlines, forEachLine, onComplete) {
        const CHUNK_SIZE = 50000; // 50kb, arbitrarily chosen.
        const decoder = new TextDecoder();
        const fr = new FileReader();
        let offset = 0;
        let linecount = 0;
        let results = ''; // Text of the line that has not ended yet.

        fr.onload = function() {
            // Use stream:true in case we cut the file
            // in the middle of a multi-byte character
            results += decoder.decode(fr.result, {stream: true});
            const lines = results.split('\n');
            results = lines.pop(); // In case the line did not end yet.
            linecount += lines.length;

            if (linecount > maxlines) {
                // Read too many lines? Truncate the results.
                lines.length -= linecount - maxlines;
                linecount = maxlines;
            }

            for (const line of lines) {
                forEachLine(line + '\n');
            }
            offset += CHUNK_SIZE;
            seek();
        };
        fr.onerror = function() {
            onComplete(fr.error);
        };
        seek();

        // Either finishes (enough lines, or end of file) or starts the next read.
        function seek() {
            // >= instead of === so a negative maxlines finishes right away rather
            // than reaching the truncation code with a negative array length.
            if (linecount >= maxlines) {
                // We found enough lines.
                onComplete(); // Done.
                return;
            }
            if (offset !== 0 && offset >= file.size) {
                // We did not find all lines, but there are no more lines.
                // Flush the streaming decoder so an incomplete trailing byte
                // sequence is reported instead of being silently dropped.
                results += decoder.decode();
                if (results !== '') {
                    forEachLine(results); // This is from lines.pop(), before.
                }
                onComplete(); // Done
                return;
            }
            const slice = file.slice(offset, offset + CHUNK_SIZE);
            fr.readAsArrayBuffer(slice);
        }
    }
    Read <input type="number" id="maxlines"> lines from
    <input type="file" id="infile">.
    <input type="button" id="start" value="Print lines to console">

    0 讨论(0)
提交回复
热议问题