[英]Node.js: Count the number of lines in a file
I have large text files, which range between 30MB
and 10GB
.我有很大的文本文件,范围在
30MB
到10GB
之间。 How can I count the number of lines in a file using Node.js
?如何使用
Node.js
计算文件中的行数?
I have these limitations:我有这些限制:
solution without using wc:不使用 wc 的解决方案:
// Count the newline bytes in the file named by the first CLI argument.
// The file is streamed chunk by chunk, so it is never held in memory as a whole.
const NEWLINE = 10; // byte value of '\n'
let count = 0;
require('fs').createReadStream(process.argv[2])
  .on('data', function (chunk) {
    // No encoding is set on the stream, so each chunk is a Buffer of byte values.
    for (let i = 0; i < chunk.length; ++i) {
      if (chunk[i] === NEWLINE) count++;
    }
  })
  .on('end', function () {
    console.log(count);
  })
  .on('error', function (err) {
    // Without a listener, a missing or unreadable file surfaces as an
    // uncaught 'error' event and kills the process with a stack trace.
    console.error(err.message);
    process.exitCode = 1;
  });
It's slower, but not by as much as you might expect - 0.6s for a 140M+ file, including node.js loading & startup time.它更慢,但没有你期望的那么多 - 140M+ 文件需要 0.6 秒,包括 node.js 加载和启动时间
>time node countlines.js video.mp4
619643
real 0m0.614s
user 0m0.489s
sys 0m0.132s
>time wc -l video.mp4
619643 video.mp4
real 0m0.133s
user 0m0.108s
sys 0m0.024s
>wc -c video.mp4
144681406 video.mp4
You could do this as the comments suggest using wc
您可以按照评论建议使用
wc
执行此操作
// Delegate the counting to the system `wc` utility.
// `-l` limits the output to the line count instead of lines/words/bytes.
const exec = require('child_process').exec;
exec('wc -l /path/to/file', function (error, results) {
  if (error) {
    // wc is missing, the file is unreadable, or wc exited non-zero:
    // report it instead of printing empty output.
    console.error(error.message);
    return;
  }
  console.log(results);
});
We can use indexOf to let the VM find the newlines:我们可以使用indexOf让 VM 找到换行符:
/**
 * Count the newline bytes in a file by streaming it.
 *
 * @param {string} filePath - path handed to fs.createReadStream.
 * @returns {Promise<number>} resolves with the number of '\n' bytes once the
 *   stream ends; rejects with the stream's error.
 */
function countFileLines(filePath) {
  const NEWLINE = 10; // byte value of '\n'

  return new Promise((resolve, reject) => {
    let total = 0;

    fs.createReadStream(filePath)
      .on("data", (chunk) => {
        // Jump from one newline to the next; indexOf does the scanning.
        for (
          let pos = chunk.indexOf(NEWLINE, 0);
          pos !== -1;
          pos = chunk.indexOf(NEWLINE, pos + 1)
        ) {
          total += 1;
        }
      })
      .on("end", () => {
        resolve(total);
      })
      .on("error", reject);
  });
}
What this solution does is that it finds the position of the first newline using .indexOf
.这个解决方案的作用是它使用
.indexOf
找到第一个换行符的位置。 It increments lineCount
, then it finds the next position.它增加
lineCount
,然后找到下一个位置。 The second parameter to .indexOf
tells where to start looking for newlines. .indexOf
的第二个参数告诉从哪里开始寻找换行符。 This way we are jumping over large chunks of the buffer.这样我们就跳过了大块的缓冲区。 The while loop will run once for every newline, plus one.
while 循环将为每个换行符运行一次,加一。
We are letting the Node runtime do the searching for us which is implemented on a lower level and should be faster.我们让 Node 运行时为我们进行搜索,这在较低级别上实现并且应该更快。
On my system this is about twice as fast as running a for
loop over the buffer length on a large file (111 MB).在我的系统上,这大约是在大文件 (111 MB) 的缓冲区长度上运行
for
循环速度的两倍。
Since iojs 1.5.0 there is a Buffer#indexOf()
method; here it is used for a comparison with Andrey Sidorov's answer:自 iojs 1.5.0 起有
Buffer#indexOf()
方法,用它来与 Andrey Sidorov 的回答进行比较:
ubuntu@server:~$ wc logs
7342500 27548750 427155000 logs
ubuntu@server:~$ time wc -l logs
7342500 logs
real 0m0.180s
user 0m0.088s
sys 0m0.084s
ubuntu@server:~$ nvm use node
Now using node v0.12.1
ubuntu@server:~$ time node countlines.js logs
7342500
real 0m2.559s
user 0m2.200s
sys 0m0.340s
ubuntu@server:~$ nvm use iojs
Now using node iojs-v1.6.2
ubuntu@server:~$ time iojs countlines2.js logs
7342500
real 0m1.363s
user 0m0.920s
sys 0m0.424s
ubuntu@server:~$ cat countlines.js
// Count the newline bytes in the file named by the first CLI argument.
// The file is streamed chunk by chunk, so it is never held in memory as a whole.
const NEWLINE = 10; // byte value of '\n'
let count = 0;
require('fs').createReadStream(process.argv[2])
  .on('data', function (chunk) {
    // No encoding is set on the stream, so each chunk is a Buffer of byte values.
    for (let i = 0; i < chunk.length; ++i) {
      if (chunk[i] === NEWLINE) count++;
    }
  })
  .on('end', function () {
    console.log(count);
  })
  .on('error', function (err) {
    // Without a listener, a missing or unreadable file surfaces as an
    // uncaught 'error' event and kills the process with a stack trace.
    console.error(err.message);
    process.exitCode = 1;
  });
ubuntu@server:~$ cat countlines2.js
// Count the newline bytes in the file named by the first CLI argument,
// letting Buffer#indexOf locate each newline instead of testing every byte.
let count = 0;
require('fs').createReadStream(process.argv[2])
  .on('data', function (chunk) {
    let index = -1;
    // Each search resumes one byte past the previous match; -1 means no more.
    while ((index = chunk.indexOf(10, index + 1)) > -1) {
      count++;
    }
  })
  .on('end', function () {
    console.log(count);
  })
  .on('error', function (err) {
    // Without a listener, a missing or unreadable file surfaces as an
    // uncaught 'error' event and kills the process with a stack trace.
    console.error(err.message);
    process.exitCode = 1;
  });
ubuntu@server:~$
If you use Node 8 and above, you can use this async/await pattern如果您使用 Node 8 及更高版本,则可以使用此 async/await 模式
const util = require('util');
// execFile starts `wc` directly, without a shell. The path is therefore passed
// as one literal argument: spaces, quotes or `;` in it cannot break or inject
// into the command line.
const execFile = util.promisify(require('child_process').execFile);

/**
 * Count the lines of a file by delegating to the system `wc -l`.
 *
 * @param {{fileLocation: string}} options - `fileLocation` is the path of the
 *   file to count.
 * @returns {Promise<number>} the line count reported by `wc`.
 * @throws rejects when `wc` cannot be started or exits non-zero (for example
 *   when the file does not exist), instead of silently yielding 0.
 */
async function fileLineCount({ fileLocation }) {
  // `--` ends option parsing, so a path starting with '-' is not read as a flag.
  const { stdout } = await execFile('wc', ['-l', '--', fileLocation]);
  // Output looks like "<count> <path>"; parseInt reads the leading number only.
  return Number.parseInt(stdout, 10);
}
// Usage
/**
 * Example caller: awaits the line count of a sample file.
 *
 * @returns {Promise<number>} the value produced by fileLineCount.
 */
async function someFunction() {
  const lineCount = await fileLineCount({ fileLocation: 'some/file.json' });
  return lineCount;
}
Here is another way without so much nesting.这是另一种没有太多嵌套的方法。
// Synchronous variant: read the file named by the first CLI argument in one
// go and count the '\n' separators.
// The whole file is loaded into memory and then converted to a string, so this
// suits small files only, not the multi-gigabyte inputs from the question.
const fs = require('fs');
const filePath = process.argv[2];
const fileBuffer = fs.readFileSync(filePath);
const fileText = fileBuffer.toString();
const pieces = fileText.split("\n");
// split() returns one more piece than there are separators.
console.log(pieces.length - 1);
// Read the file named by the first CLI argument synchronously and print the
// number of '\n' characters it contains.
const fs = require('fs');
const filename = process.argv[2];
const data = fs.readFileSync(filename);
// split() returns one more piece than there are separators, hence the -1 below.
const res = data.toString().split('\n').length;
console.log(res - 1);
Best solution I've found is using promises, async, and await.我发现的最佳解决方案是使用 promises、async 和 await。 This is also an example of how to await the fulfillment of a promise:
这也是一个如何等待履行承诺的例子:
#!/usr/bin/env node
const fs = require('fs');
const readline = require('readline');
/**
 * Entry point: prints every line of 'async.js' prefixed with its zero-based
 * index, then prints the total number of lines.
 */
function main() {
  /**
   * Read 'async.js' line by line through readline.
   *
   * @returns {Promise<number>} resolves with the number of lines once the
   *   interface closes; rejects if the file cannot be read.
   */
  function doRead() {
    return new Promise((resolve, reject) => {
      const input = fs.createReadStream('async.js');
      const inf = readline.createInterface({
        input,
        crlfDelay: Infinity
      });
      // Listen on both objects: depending on the Node version a read failure is
      // reported on the stream, on the interface, or on both. Without these
      // listeners it is an uncaught 'error' event and the promise never settles.
      input.on('error', reject);
      inf.on('error', reject);

      let count = 0;
      inf.on('line', (line) => {
        console.log(count + ' ' + line);
        count += 1;
      });
      inf.on('close', () => resolve(count));
    });
  }

  /** Await the read and report the total. */
  async function showRead() {
    const x = await doRead();
    console.log('line count: ' + x);
  }

  // Do not leave the promise floating: report a failure and signal it to the shell.
  showRead().catch((err) => {
    console.error(err.message);
    process.exitCode = 1;
  });
}

main();
You can also use indexOf():您还可以使用 indexOf():
// Count occurrences of the value 10 ('\n' as a byte) in `chunk` by hopping
// from one match to the next with indexOf.
// NOTE(review): presumably `chunk` is a Buffer from a stream 'data' event; confirm at the call site.
var count = 0;
var index = chunk.indexOf(10, 0);
while (index > -1) {
  count += 1;
  index = chunk.indexOf(10, index + 1);
}
There is an npm module called count-lines-in-file .有一个名为count-lines-in-file的 npm 模块。 I've been using it for smallish (<1000 lines) files and it's worked great so far.
我一直将它用于较小的(<1000 行)文件,到目前为止效果很好。
Simple solution using readline
:使用
readline
的简单解决方案:
import readline from 'node:readline';
/**
 * Count the lines delivered by a readable stream.
 *
 * @param {import('node:stream').Readable} input - stream handed to
 *   readline.createInterface as its `input`.
 * @returns {Promise<number>} number of lines readline produced from the stream.
 */
export default async function countLines(input) {
  const lineReader = readline.createInterface({ input, crlfDelay: Infinity });
  const lines = lineReader[Symbol.asyncIterator]();

  let total = 0;
  // Pull lines one at a time; only their number matters, not their content.
  while (!(await lines.next()).done) {
    total += 1;
  }
  return total;
}
import fs from 'node:fs';
// Example: stream 'file.txt' through countLines and print the total.
const totalLines = await countLines(fs.createReadStream('file.txt'));
console.log(totalLines);
//=> <number>
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.