With nodejs I want to parse a .csv file of 10000 records and do some operation on each row. I tried using http://www.adaltas.com/projects/node-csv. I couldnt get this to pau
csv-parser
instead of csv-parse
used in some
of the answers above. csv-parser
came around 2 years after
csv-parse
. csv-parser
is better, as it makes headers easy to handle. Install csv-parser first:
npm install csv-parser
So suppose you have a csv-file like this:
NAME, AGE
Lionel Messi, 31
Andres Iniesta, 34
You can perform the required operation as:
const fs = require('fs');
const csv = require('csv-parser');

// Streams the CSV at `inputFilePath` row by row; each row arrives as an object
// keyed by the header line.
//
// The sample file has a space after every comma ("NAME, AGE"), so without
// trimming, the second column would be keyed " AGE" and data.AGE would be
// undefined. Headers and values are therefore trimmed while parsing.
const parserOptions = {
  mapHeaders: ({ header }) => header.trim(),
  mapValues: ({ value }) => value.trim(),
};

fs.createReadStream(inputFilePath)
  // .pipe() does not forward errors, so a missing or unreadable file has to be
  // handled on the read stream itself.
  .on('error', (err) => {
    console.error('Failed to read the csv file:', err);
  })
  .pipe(csv(parserOptions))
  .on('data', (data) => {
    try {
      console.log(`Name is: ${data.NAME}`);
      console.log(`Age is: ${data.AGE}`);
      //perform the operation
    } catch (err) {
      // Report the failing row instead of silently dropping it.
      console.error('Failed to process row:', data, err);
    }
  })
  .on('error', (err) => {
    console.error('Failed to parse the csv file:', err);
  })
  .on('end', () => {
    //some final operation
  });
For further reading refer
My current solution uses the async module to execute in series:
const fs = require('fs');
const parse = require('csv-parse');
const async = require('async');

const inputFile = 'myfile.csv';

// Parses the whole file, then processes the records strictly one after another.
// The callback form of parse() hands over all records at once in `data`.
const parser = parse({ delimiter: ',' }, (err, data) => {
  if (err) {
    // On a parse failure there are no records to iterate over.
    console.error('Failed to parse the csv file:', err);
    return;
  }

  async.eachSeries(
    data,
    (line, callback) => {
      // do something with the line
      doSomething(line).then(
        // when processing finishes invoke the callback to move to the next one
        () => callback(),
        // a rejection must reach the callback too, otherwise the series hangs
        callback
      );
    },
    (seriesErr) => {
      // Runs once: after the last line, or as soon as one line fails.
      if (seriesErr) {
        console.error('Processing stopped on error:', seriesErr);
      }
    }
  );
});

fs.createReadStream(inputFile)
  // .pipe() does not forward read errors (e.g. missing file) to the parser.
  .on('error', (err) => {
    console.error('Failed to read the csv file:', err);
  })
  .pipe(parser);
const fs = require('fs');

// Reads the entire file into memory as a UTF-8 string and logs it.
fs.readFile('FILENAME WITH PATH', 'utf8', (err, content) => {
  if (err) {
    console.log('error occured ' + JSON.stringify(err));
    // Stop here: `content` is undefined when the read failed.
    return;
  }
  console.log('Fileconetent are ' + JSON.stringify(content));
});
The node-csv project that you are referencing is completely sufficient for the task of transforming each row of a large portion of CSV data, from the docs at: http://csv.adaltas.com/transform/:
// Row handler: reverses the columns of a record (in place) and hands the result
// back on the next tick. The three-parameter signature is kept as-is.
const reverseRow = (row, index, callback) => {
  process.nextTick(() => {
    callback(null, row.reverse());
  });
};

// Feed two inline records through the handler and print each result.
csv()
  .from('82,Preisner,Zbigniew\n94,Gainsbourg,Serge')
  .to(console.log)
  .transform(reverseRow);
From my experience, I can say that it is also a rather fast implementation, I have been working with it on data sets with near 10k records and the processing times were at a reasonable tens-of-milliseconds level for the whole set.
Regarding jurka's stream-based solution suggestion: node-csv IS stream based and follows the Node.js streaming API.
Ok so there are many answers here and I dont think they answer your question which I think is similar to mine.
You need to do an operation, like contacting a database or a third-party API, that takes time and is asynchronous. You do not want to load the entire document into memory, because it is too large or for some other reason, so you need to read it line by line to process it.
I have read the fs documentation and a stream can be paused while reading, but attaching an .on('data') handler makes the stream flow continuously, which is what most of these answers do and what causes the problem.
UPDATE: I know more info about Streams than I ever wanted
The best way to do this is to create a writable stream. The csv data is piped into your writable stream, in which you can manage asynchronous calls. The pipe manages the buffer all the way back to the reader, so you will not wind up with heavy memory usage.
Simple Version
const fs = require('fs'); // needed for createReadStream below
const parser = require('csv-parser');
const stripBom = require('strip-bom-stream');
const stream = require('stream');

// Minimal sink: receives one parsed row at a time and asks for the next row
// only after done() has been called.
const mySimpleWritable = new stream.Writable({
  objectMode: true, // Because input is object from csv-parser
  write(chunk, encoding, done) { // Required
    // chunk is object with data from a line in the csv
    console.log('chunk', chunk);
    done();
  },
  final(done) { // Optional
    // last place to clean up when done
    done();
  },
});

// pipeline() is used instead of chained .pipe() calls because .pipe() does not
// forward errors between stages; here a failure in any stage reaches the
// final callback.
stream.pipeline(
  fs.createReadStream(fileNameFull),
  stripBom(),
  parser(),
  mySimpleWritable,
  (err) => {
    if (err) {
      console.error('CSV pipeline failed', err);
    }
  }
);
Class Version
const parser = require('csv-parser');
const stripBom = require('strip-bom-stream');
const stream = require('stream')
// Create writable class
/**
 * Writable sink for rows piped in from csv-parser. Rows are handled strictly
 * one at a time: the next row is requested only after the current one has
 * finished (or failed).
 */
class MyWritable extends stream.Writable {
  /**
   * @param {*} another_variable - extra context made available to every row.
   * @param {object} [options] - stream.Writable options; objectMode is always
   *   forced on because the parser pipes in objects, not buffers.
   */
  constructor(another_variable, options) {
    // Calls the stream.Writable() constructor.
    super({ ...options, objectMode: true });
    // additional information if you want
    this.another_variable = another_variable;
  }

  /**
   * Per-row work. Replace or override this with the real asynchronous
   * operation (database call, API request, ...).
   *
   * @param {object} chunk - one csv line as an object.
   * @returns {Promise<void>}
   */
  async processChunk(chunk) {
    console.log('Chunk Data', this.another_variable, chunk);
    // demonstrate await call
    // No further row is delivered until this promise settles.
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }

  /**
   * Called over and over, once for each line in the csv.
   *
   * @param {object} chunk - one csv line as an object.
   * @param {string} encoding - unused in object mode.
   * @param {function(Error=): void} done - must be called exactly once per row.
   */
  async _write(chunk, encoding, done) {
    try {
      await this.processChunk(chunk);
    } catch (err) {
      // Hand the failure to the stream; otherwise the pipe would stall forever
      // and the rejection would go unhandled.
      done(err);
      return;
    }
    // Very important to add. Keeps the pipe buffers correct. Will load the next line of data
    done();
  }

  /**
   * Gets called when all lines have been read.
   *
   * @param {function(Error=): void} done - signals that clean-up is complete.
   */
  _final(done) {
    // Can do more calls here with left over information in the class
    console.log('clean up');
    // lets the pipe know it is done; the 'finish' event is emitted afterwards
    done();
  }
}
// fs provides the read stream and is not loaded by the requires above.
const fs = require('fs');

// Instantiate the new writable class
const myWritable = new MyWritable(somevariable);

// optional
myWritable.on('finish', () => {
  // will be called after the writable's internal _final
  console.log('Called very last');
});

// Pipe the read stream to csv-parser, then to your write class
// stripBom is due to Excel saving csv files with UTF8 - BOM format
// pipeline() is used instead of chained .pipe() calls because .pipe() does not
// forward errors between stages; here any failure reaches the final callback.
stream.pipeline(
  fs.createReadStream(fileNameFull),
  stripBom(),
  parser(),
  myWritable,
  (err) => {
    if (err) {
      console.error('CSV pipeline failed', err);
    }
  }
);
OLD METHOD:
PROBLEM WITH readable
const csv = require('csv-parser');
const fs = require('fs');
/**
 * Reads a csv file through csv-parser and handles the parsed rows by pulling
 * them with read() from inside the 'readable' event.
 *
 * @param {string} fileNameFull - path of the csv file to read.
 * @returns {Promise<void>} resolves as soon as the handlers are attached; it
 *   does NOT wait for the file to be fully processed.
 */
const processFileByLine = async (fileNameFull) => {
  // True while a drain loop is running, so that a 'readable' event arriving in
  // the middle of asynchronous row processing does not start a second loop.
  let reading = false;

  const logStreamError = (err) => {
    console.log('error', err);
  };

  const source = fs.createReadStream(fileNameFull);
  // .pipe() does not forward errors, so file errors must be caught on the
  // source stream itself.
  source.on('error', logStreamError);

  const rr = source.pipe(csv());

  // Magic happens here
  rr.on('readable', async () => {
    // Fires whenever buffered rows become available, and once more at the end
    // of the stream.
    console.log('starting readable');
    if (reading) {
      console.log('ignoring reading');
      return;
    }
    reading = true;
    try {
      let data;
      while ((data = rr.read()) !== null) {
        // data variable will be an object with information from the line it read
        // PROCESS DATA HERE
        console.log('new line of data', data);
      }
    } finally {
      // Release the guard so the next 'readable' event can drain the rows that
      // arrive later; leaving it set would stall any file larger than one
      // buffer.
      reading = false;
    }
    // The buffer is empty for now; more 'readable' events may follow until 'end'.
    console.log('Finished readable');
  });

  rr.on('end', () => {
    // File has finished being read
    console.log('closing file');
  });

  // Parser-side errors
  rr.on('error', logStreamError);
};
You will notice a reading
flag. I have noticed that for some reason right near the end of the file the .on('readable') gets called a second time on small and large files. I am unsure why but this blocks that from a second process reading the same line items.
I used this way:-
const fs = require('fs');
const parse = require('csv-parse');

// Every parsed row is collected here so the complete set is available on 'end'.
const csvData = [];

// Logs a row and stores it.
const collectRow = (csvrow) => {
  console.log(csvrow);
  //do something with csvrow
  csvData.push(csvrow);
};

// Runs once the parser has emitted its last row.
const reportRows = () => {
  //do something with csvData
  console.log(csvData);
};

// The uploaded file uses ':' as its column delimiter.
const rowStream = fs.createReadStream(req.file.path).pipe(parse({ delimiter: ':' }));
rowStream.on('data', collectRow);
rowStream.on('end', reportRows);