The smallest file I have has > 850k lines, and every line is of unknown length. The goal is to read n
lines from this file in the browser. Reading it fully is not an option.
I needed to read 250 MB UTF-8 encoded files in the browser. My solution was to write a C#-like TextReader class that gave me async, stream-like behaviour.
TextReader class:
/**
 * Incremental, line-oriented reader for large UTF-8 Blobs/Files in the browser.
 * Reads the blob in fixed-size chunks via FileReader, carries undecoded bytes
 * between chunks (so multi-byte UTF-8 characters split across a chunk boundary
 * are never corrupted), and hands out one line per readLine() call.
 */
class TextReader {
  // Chunk size in bytes. Larger chunks mean fewer Blob reads but higher peak
  // memory. ~8 MB was found to work well by the author; adjust as needed.
  CHUNK_SIZE = 8192000;
  position = 0;                    // next byte offset to read from the blob
  length = 0;                      // total blob size in bytes
  byteBuffer = new Uint8Array(0);  // raw bytes carried over between chunks
  lines = [];                      // decoded complete lines awaiting dispatch
  lineCount = 0;                   // number of entries in `lines`
  lineIndexTracker = 0;            // index of the next cached line to hand out
  fileReader = new FileReader();   // reused for every chunk (reads are sequential)
  textDecoder = new TextDecoder(`utf-8`);

  /** True once every line cached by the last decode has been handed out. */
  get allCachedLinesAreDispatched() {
    return !(this.lineIndexTracker < this.lineCount);
  }

  /** True once every byte of the blob has been pulled into memory. */
  get blobIsReadInFull() {
    return !(this.position < this.length);
  }

  /** True when no undecoded bytes are carried over. */
  get bufferIsEmpty() {
    return this.byteBuffer.length === 0;
  }

  /** True when there is nothing left to return: blob, cache and buffer are all drained. */
  get endOfStream() {
    return this.blobIsReadInFull && this.allCachedLinesAreDispatched && this.bufferIsEmpty;
  }

  /**
   * @param {Blob} blob - the Blob/File to read line by line.
   */
  constructor(blob) {
    this.blob = blob;
    this.length = blob.size;
  }

  /**
   * Adapts the callback-based FileReader API to a Promise.
   * (This is the one legitimate use of the Promise constructor here.)
   * @param {Blob} blob - slice to read.
   * @returns {Promise<ArrayBuffer>} resolves with the slice's bytes; rejects on read error.
   */
  blob2arrayBuffer(blob) {
    return new Promise((resolve, reject) => {
      this.fileReader.onerror = reject;
      this.fileReader.onload = () => {
        resolve(this.fileReader.result);
      };
      this.fileReader.readAsArrayBuffer(blob);
    });
  }

  /**
   * Reads up to `count` bytes starting at `offset`.
   * NOTE: written as a plain async method (not `new Promise(async ...)`) so a
   * FileReader failure rejects the returned promise instead of being swallowed
   * inside the executor and leaving the promise pending forever.
   * @param {number} offset - 0-based byte offset into the blob.
   * @param {number} count - maximum number of bytes to read (>= 1).
   * @returns {Promise<ArrayBuffer>} the bytes read; empty buffer for out-of-range
   *   or malformed arguments (mirrors the original's silent-empty contract).
   */
  async read(offset, count) {
    if (!Number.isInteger(offset) || !Number.isInteger(count) || count < 1 || offset < 0 || offset > this.length - 1) {
      return new ArrayBuffer(0);
    }
    const endIndex = Math.min(offset + count, this.length);
    return this.blob2arrayBuffer(this.blob.slice(offset, endIndex));
  }

  /**
   * Returns the next line. Complete lines are returned with a trailing `\n`;
   * a final line not terminated by `\n` is returned as-is; `null` signals
   * end of stream.
   * @returns {Promise<string|null>}
   */
  async readLine() {
    // Serve from the cache of already-decoded lines first.
    if (!this.allCachedLinesAreDispatched) {
      return this.lines[this.lineIndexTracker++] + `\n`;
    }
    // Pull chunks until the buffer contains at least one line feed.
    while (!this.blobIsReadInFull) {
      const arrayBuffer = await this.read(this.position, this.CHUNK_SIZE);
      this.position += arrayBuffer.byteLength;
      // Append the new bytes to whatever partial data was carried over.
      const grown = new Uint8Array(this.byteBuffer.length + arrayBuffer.byteLength);
      grown.set(this.byteBuffer);
      grown.set(new Uint8Array(arrayBuffer), this.byteBuffer.length);
      this.byteBuffer = grown;
      // LINE FEED (\n) is a single byte (10) in UTF-8, so a raw byte scan is
      // safe even when multi-byte characters straddle chunk boundaries.
      const lastLineFeed = this.byteBuffer.lastIndexOf(10);
      if (lastLineFeed > -1) {
        // Decode ONLY the complete lines (everything before the last \n).
        // The undecoded tail stays in byteBuffer as raw bytes, so it is
        // neither decoded twice nor corrupted mid-character.
        const decoded = this.textDecoder.decode(this.byteBuffer.subarray(0, lastLineFeed));
        this.byteBuffer = this.byteBuffer.slice(lastLineFeed + 1);
        const completeLines = decoded.split(`\n`);
        this.lines = completeLines.slice(1);
        this.lineCount = this.lines.length;
        this.lineIndexTracker = 0;
        return completeLines[0] + `\n`;
      }
    }
    // Final partial line: the file did not end with \n.
    if (!this.bufferIsEmpty) {
      const line = this.textDecoder.decode(this.byteBuffer);
      this.byteBuffer = new Uint8Array(0);
      return line;
    }
    return null; // end of stream
  }
}
Usage:
document.getElementById("read").onclick = async () => {
  const file = document.getElementById("fileInput").files[0];
  // Guard: with no file selected, files[0] is undefined and
  // `new TextReader(undefined)` would throw on `blob.size`.
  if (!file) return;
  const textReader = new TextReader(file);
  // Option 1: read until readLine() signals end of stream with null.
  while (true) {
    const line = await textReader.readLine();
    if (line === null) break;
    // PROCESS LINE
  }
  // OR
  // Option 2: poll the endOfStream getter instead.
  while (!textReader.endOfStream) {
    const line = await textReader.readLine();
    // PROCESS LINE
  }
};
Performance:
I was able to read a single 250 MB UTF-8 encoded text file consisting of 1,398,258 rows in ~1.5 s, with the JS heap size not exceeding 20 MB. In comparison, if I read the same file in one go and then split the resulting string by \n, it still takes ~1.5 s; however, the JS heap shoots up to 230 MB.