Read n lines of a big text file

后端 未结 3 516
囚心锁ツ
囚心锁ツ 2020-12-29 11:58

The smallest file I have has > 850k lines and every line is of unknown length. The goal is to read n lines from this file in the browser. Reading it fully is no

3条回答
  •  礼貌的吻别
    2020-12-29 12:34

    I needed to read 250MB UTF-8 encoded files in the browser. My solution was to write a C#-like TextReader class that gave me async, stream-like behaviour.


    TextReader class:

    /**
     * Incremental, line-oriented reader for a UTF-8 encoded Blob/File.
     *
     * The blob is read in slices of CHUNK_SIZE bytes. Bytes are accumulated until
     * at least one line feed is seen; every complete line in the accumulated bytes
     * is then decoded and cached, and the bytes after the last line feed are kept
     * for the next read. Only `\n` is treated as a line terminator, so lines from
     * CRLF files keep their trailing `\r`.
     */
    class TextReader {
        CHUNK_SIZE = 8192000; // I FOUND THIS TO BE BEST FOR MY NEEDS, CAN BE ADJUSTED
        position = 0; // byte offset of the next slice to read from the blob
        length = 0; // total size of the blob in bytes

        byteBuffer = new Uint8Array(0); // bytes read so far that do not yet form a complete line

        lines = []; // decoded lines waiting to be handed out by readLine()
        lineCount = 0;
        lineIndexTracker = 0; // index into `lines` of the next line to hand out

        // Only used as a fallback when the blob has no arrayBuffer() method.
        // Guarded so the class can be constructed where FileReader does not exist.
        fileReader = typeof FileReader === `function` ? new FileReader() : null;
        textDecoder = new TextDecoder(`utf-8`);

        /** True when every cached line has already been returned by readLine(). */
        get allCachedLinesAreDispatched() {
            return !(this.lineIndexTracker < this.lineCount);
        }

        /** True when the read position has reached the end of the blob. */
        get blobIsReadInFull() {
            return !(this.position < this.length);
        }

        /** True when no undecoded bytes are pending. */
        get bufferIsEmpty() {
            return this.byteBuffer.length === 0;
        }

        /** True when readLine() has nothing left to return. */
        get endOfStream() {
            return this.blobIsReadInFull && this.allCachedLinesAreDispatched && this.bufferIsEmpty;
        }

        /**
         * @param {Blob} blob - Blob or File to read; its `size` is captured here.
         */
        constructor(blob) {
            this.blob = blob;
            this.length = blob.size;
        }

        /**
         * Reads a blob fully into memory.
         *
         * @param {Blob} blob - Blob (typically a slice) to read.
         * @returns {Promise<ArrayBuffer>} Resolves with the blob's bytes; rejects if the read fails.
         */
        blob2arrayBuffer(blob) {
            // Promise-native API: no shared event handlers that overlapping reads could clobber.
            if (typeof blob.arrayBuffer === `function`) {
                return blob.arrayBuffer();
            }

            return new Promise((resolve, reject) => {
                const reader = this.fileReader;

                reader.onerror = () => reject(reader.error);
                reader.onload = () => resolve(reader.result);

                reader.readAsArrayBuffer(blob);
            });
        }

        /**
         * Reads up to `count` bytes starting at byte `offset`.
         *
         * @param {number} offset - Start offset in bytes.
         * @param {number} count - Maximum number of bytes to read.
         * @returns {Promise<ArrayBuffer>} The bytes read, clamped to the end of the
         *   blob. An empty ArrayBuffer when the arguments are not integers, `count`
         *   is below 1, or `offset` lies outside the blob. Rejects if the read fails.
         */
        async read(offset, count) {
            const isValidRequest =
                Number.isInteger(offset) &&
                Number.isInteger(count) &&
                count >= 1 &&
                offset >= 0 &&
                offset <= this.length - 1;

            if (!isValidRequest) {
                return new ArrayBuffer(0);
            }

            const endIndex = Math.min(offset + count, this.length);

            return this.blob2arrayBuffer(this.blob.slice(offset, endIndex));
        }

        /**
         * Returns the next line of the blob.
         *
         * @returns {Promise<string|null>} The next line including its trailing `\n`;
         *   the final line without `\n` when the blob does not end with a line feed;
         *   `null` once everything has been returned.
         * @throws {RangeError} (as a rejection) when CHUNK_SIZE is not a positive
         *   integer, because no progress could be made through the blob.
         */
        async readLine() {
            if (!this.allCachedLinesAreDispatched) {
                return `${this.lines[this.lineIndexTracker++]}\n`;
            }

            while (!this.blobIsReadInFull) {
                const arrayBuffer = await this.read(this.position, this.CHUNK_SIZE);

                // position < length here, so an empty result can only come from an
                // unusable CHUNK_SIZE; fail loudly instead of looping forever.
                if (arrayBuffer.byteLength === 0) {
                    throw new RangeError(`CHUNK_SIZE must be a positive integer`);
                }

                this.position += arrayBuffer.byteLength;

                // Append the new chunk to the bytes left over from earlier reads.
                const tempByteBuffer = new Uint8Array(this.byteBuffer.length + arrayBuffer.byteLength);
                tempByteBuffer.set(this.byteBuffer);
                tempByteBuffer.set(new Uint8Array(arrayBuffer), this.byteBuffer.length);

                this.byteBuffer = tempByteBuffer;

                const lastIndexOfLineFeedCharacter = this.byteBuffer.lastIndexOf(10); // LINE FEED CHARACTER (\n) IS ONE BYTE LONG IN UTF-8 AND IS 10 IN ITS DECIMAL FORM

                if (lastIndexOfLineFeedCharacter > -1) {
                    // Decode only the complete lines. The bytes after the last line feed
                    // may end in the middle of a multi-byte character, so they stay
                    // undecoded in the buffer until more data arrives.
                    const completeBytes = this.byteBuffer.subarray(0, lastIndexOfLineFeedCharacter);
                    const [firstLine, ...remainingLines] = this.textDecoder.decode(completeBytes).split(`\n`);

                    this.byteBuffer = this.byteBuffer.slice(lastIndexOfLineFeedCharacter + 1);

                    this.lines = remainingLines;
                    this.lineCount = remainingLines.length;
                    this.lineIndexTracker = 0;

                    return `${firstLine}\n`;
                }
            }

            // Blob exhausted: whatever is left is a final line without a line feed.
            if (!this.bufferIsEmpty) {
                const line = this.textDecoder.decode(this.byteBuffer);
                this.byteBuffer = new Uint8Array(0);
                return line;
            }

            return null;
        }
    }
    

    Usage:

    // Click handler: streams the first selected file line by line.
    // Two equivalent ways of draining the reader are shown.
    document.getElementById("read").onclick = async () => {
        const selectedFile = document.getElementById("fileInput").files[0];
        const reader = new TextReader(selectedFile);

        // Variant 1: keep reading until readLine() yields null.
        let currentLine;
        while ((currentLine = await reader.readLine()) !== null) {
            // PROCESS LINE
        }

        // OR

        // Variant 2: check the reader's endOfStream flag before each read.
        for (;;) {
            if (reader.endOfStream) break;
            const nextLine = await reader.readLine();
            // PROCESS LINE
        }
    };
    

    Performance:

    I was able to read a single 250MB UTF-8 encoded text file consisting of 1,398,258 rows in ~1.5s, with the JS heap size not exceeding 20MB. In comparison, if I read the same file in one go and then split the resulting string by \n, it still takes ~1.5s; however, the JS heap shoots up to 230MB.

提交回复
热议问题