
Read a large file N lines at a time in Node.JS

I have a file with 65,000,000 lines that is about 2 GB in size.

I want to read this file N lines at a time, perform a db insert operation, and then read the next N, with N being, say, 1000 in this case. Insert order doesn't matter, so synchronous is fine.

What's the best way of doing this? I've only found ways to either load it in one line at a time, or methods that read the whole file into memory. Sample code below that I've been using to read the file one line at a time:

// LineByLineReader comes from the 'line-by-line' npm package
var LineByLineReader = require('line-by-line');

var singleFileParser = (file, insertIntoDB) => {
    var lr = new LineByLineReader(file);
    lr.on('error', function(err) {
        // 'err' contains error object
        console.error(err);
        console.error("Error reading file!");
    });

    lr.on('line', function(line) {
        // 'line' contains the current line without the trailing newline character.
        insertIntoDB(line);
    });

    lr.on('end', function() {
        // All lines are read, file is closed now.
    });
};

Something like this should do:

var cnt = 0;
var tenLines = [];
lr.on('line', function(line) {
    tenLines.push(line);
    if (++cnt >= 10) {
        lr.pause();
        // prepare your SQL statements from tenLines
        dbInsert(<yourSQL>, function(error, returnVal) {
            cnt = 0;
            tenLines = [];
            lr.resume();
        });
    }
});

Lines can only be read one at a time by these readers. So, if you want 10 at once, you just collect them one at a time until you have collected 10, and then process the 10.

I did not think Jarek's code quite worked right, so here's a different version that collects 10 lines into an array and then calls dbInsert():

var tenLines = [];
lr.on('line', function(line) {
    tenLines.push(line);
    if (tenLines.length === 10) {
        lr.pause();
        dbInsert(<yourSQL>, function(error, returnVal){
            if (error) {
                // some sort of error handling here
            }
            tenLines = [];
            lr.resume();
        });
    }
});
// process last set of lines in the tenLines buffer (if any)
lr.on('end', function() {
    if (tenLines.length !== 0) {
        // process last set of lines
        dbInsert(...);
    }
});

Jarek's version seemed to call dbInsert() on every line event rather than only on every 10th line event, and it did not process any leftover lines at the end of the file if the file wasn't a perfect multiple of 10 lines long.
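For illustration, here is a minimal sketch of what a dbInsert() helper for the batched callback version could look like. It assumes the 'mysql' npm package and a hypothetical table lines(text_line); the table, column, and connection settings are placeholders, not something specified in the question.

// Hypothetical helper, assuming the 'mysql' npm package and a table lines(text_line)
var mysql = require('mysql');
var connection = mysql.createConnection({
    host: 'localhost',
    user: 'me',
    password: 'secret',
    database: 'mydb'
});

// Inserts an array of lines as one multi-row INSERT, then invokes the callback
function dbInsert(lines, callback) {
    // the mysql package expands a nested array bound to "VALUES ?" into (v1), (v2), ...
    var values = lines.map(function(line) {
        return [line];
    });
    connection.query('INSERT INTO lines (text_line) VALUES ?', [values], callback);
}

With a helper like this, the <yourSQL> placeholder above would simply become the collected batch, e.g. dbInsert(tenLines, function(error, returnVal) { ... }).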

This is my solution inside an async function:

let multipleLines = [];
const filepath = '<file>';
const numberLines = 50;

const lineReader = require('readline').createInterface({
    input: require('fs').createReadStream(filepath)
});

// process lines in batches of numberLines
for await (const line of lineReader) {
    multipleLines.push(line);
    if (multipleLines.length === numberLines) {
        // insert the collected batch, then start a new one
        await dbInsert(multipleLines);
        multipleLines = [];
    }
}
// process the last set of lines (if any)
if (multipleLines.length !== 0) {
    await dbInsert(multipleLines);
}
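Note that for await...of only works inside an async function (or at the top level of an ES module), so a minimal wrapper around the snippet above might look like the following; parseFile is a hypothetical name and dbInsert(batch) is assumed to return a Promise.

const fs = require('fs');
const readline = require('readline');

// Hypothetical wrapper; dbInsert(batch) is assumed to return a Promise
async function parseFile(filepath, numberLines, dbInsert) {
    const lineReader = readline.createInterface({
        input: fs.createReadStream(filepath),
        crlfDelay: Infinity
    });

    let multipleLines = [];
    for await (const line of lineReader) {
        multipleLines.push(line);
        if (multipleLines.length === numberLines) {
            await dbInsert(multipleLines);
            multipleLines = [];
        }
    }
    // flush the final, partial batch
    if (multipleLines.length !== 0) {
        await dbInsert(multipleLines);
    }
}

parseFile('<file>', 50, async (batch) => {
    // replace with your real insert logic
    console.log('inserting', batch.length, 'lines');
}).catch(console.error);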
