Writing large amount of strings from memory to a file using streams in node.js causing high RAM usage

Question

I am trying to write a few million strings to a file using Node.js streams, but the RAM usage goes up to 800MB during the process:

const fs = require('fs')
const walkdir = require('walkdir')

let options = {
  "max_depth": 0,
  "track_inodes": true
}

let dir = "C:/"
let paths = walkdir(dir, options)
var wstream = fs.createWriteStream('C:/test/file.txt')
wstream.write('[')

paths.on('path', function(path, stat) {
  wstream.write(`"${path}",`)
})

paths.on('end', function(path, stat) {
  wstream.write(']')
  wstream.end()

  // Compressing the file after it's written:
  const gzip = require('zlib').createGzip()
  const inp = fs.createReadStream('C:/test/file.txt')
  const out = fs.createWriteStream('C:/test/file.txt.gz')
  inp.pipe(gzip).pipe(out)
})

I also tried writing the file like this:

...
paths.on('path', function(path, stat) {
  fs.writeFileSync('C:/test/file.txt', path)
})
...

I also tried the sync version:

walkdir.sync(dir, options, callback)

function callback(path) {
  let res = wstream.write(`"${path}",`)
  if (!res) {
    wstream.once('drain', callback)
  }
  else {
    callback()
  }
}

But both of these produce the same result: RAM usage climbs to 500-800MB.

I also tried the following approach. The RAM usage stays at ~100MB the whole time, but it doesn't really work: it writes 412kb to the file and then just keeps using CPU while nothing actually happens (the other methods finish writing the file in 1-2 minutes).

const readdirp = require('readdirp');

const { Transform } = require('stream');
const entryInfoStream = readdirp({
  root: dir
});

entryInfoStream
  .pipe(new Transform({
    objectMode: true,
    transform(entryInfo, encoding, callback) {
      this.push(entryInfo.path);
      callback();
    },
  }))
  .pipe(wstream);

Questions

  • How do I make sure the streams work as expected (low memory usage)?

  • How do I compress (gzip) the file while it is being written? Or can I only do that after the write has finished?

You can implement the whole logic without any external dependencies, to see where there is room for optimization. Below is a minimal implementation you can tweak:

const fs = require('fs');
const path = require('path');
const zlib = require('zlib');
const stream = require('stream');

// Recursively walk the file system
function walk(dir, str, busy) {
    busy.inc();
    fs.readdir(dir, (e, c) => {
        if (!e) {
            c.forEach(f => {
                const p = path.join(dir, f);
                busy.inc();
                fs.stat(p, (e, s) => {
                    if (!e && s.isDirectory()) {
                        walk(p, str, busy);
                    }
                    str.write(p + "\n");
                    busy.dec();
                });
            });
        }
        busy.dec();
    });
}

// Scan FS and write to file
async function scan(dir, dest) {
    return new Promise((resolve) => {
        const gzStr = zlib.createGzip();
        const destStr = fs.createWriteStream(dest);

        let count = 0;
        const busy = {
            inc: () => count++,
            dec: () => {
                count--;
                if (count < 1) {
                    process.nextTick(() => {
                        gzStr.end();
                        gzStr.once('finish', resolve);
                    });
                }
            }
        };

        walk(dir, gzStr, busy);
        gzStr.pipe(destStr);
    });
}

// Test above code
(async () => {
    // Save gzipped
    await scan(__dirname, './files.txt.gz');

    // Gunzip to verify
    const unzipped = fs.createWriteStream('./files.txt');
    fs.createReadStream('./files.txt.gz').pipe(zlib.createGunzip()).pipe(unzipped);

    // End 
    unzipped.on('close', () => console.log('done'));
})();

That's because you are doing things asynchronously without any limit. Each path creates a new paths.on('path', ...) event, so all the paths are loaded into the event loop much faster than they are processed, hence the spike in memory. You need to limit the number of paths being written at a time.
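
For illustration only (this is not part of the original answer): a minimal sketch of the standard back-pressure pattern described above, which checks the boolean returned by write() and waits for 'drain' before asking walkdir for more paths. It assumes walkdir's pause()/resume(), which the full solution further down also relies on.

paths.on('path', function(path) {
  const ok = wstream.write(`"${path}",`)
  if (!ok) {
    // The write stream's internal buffer is full: stop emitting new paths
    paths.pause()
    // ...and continue once the buffer has been flushed to disk
    wstream.once('drain', () => paths.resume())
  }
})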

You could throttle it by using walkdir.sync, but that would mean you can only process one path at a time. Also, depending on how you implement it, you might still end up discovering paths faster than you can write them to your stream.

A more flexible solution is to keep track of how many paths are being processed concurrently and pause the stream once you hit the limit.

const fs = require('fs')
const walkdir = require('walkdir')

let options = {
  "max_depth": 0,
  "track_inodes": true
}

let dir = "C:/"
let paths = walkdir(dir, options)
var wstream = fs.createWriteStream('C:/test/file.txt')
wstream.write('[')

const maxPaths = 20; // Maximum amount of concurrent paths allowed to process
let currentPaths = 0; // Current amount of concurrent paths being processed
let deferredPaths = []; // If we somehow exceed the limit, store the excess paths here for later processing. This might not be necessary, depending on how walkdir implements their pause function

const finishPathFlush = () => {
  if (deferredPaths.length > 0) {
    // Process any paths in the deferred queue
    wstream.write('"' + deferredPaths.pop() + '",', finishPathFlush);
  } else {
    // No more work to do, resume walkdir
    --currentPaths;
    paths.resume();
  }
}

paths.on('path', function(path, stat) {
  if (currentPaths < maxPaths) {
    // We have room to process this path
    if (++currentPaths === maxPaths) {
      // If we reach the limit pause walkdir
      paths.pause();
    }
    wstream.write(`"${path}",`, finishPathFlush)
  } else {
    // Got too many paths, defer this path
    deferredPaths.push(path);
  }
})

paths.on('end', function(path, stat) {
  wstream.write(']')
  wstream.end()

  // Compressing the file after it's written:
  const gzip = require('zlib').createGzip()
  const inp = fs.createReadStream('C:/test/file.txt')
  const out = fs.createWriteStream('C:/test/file.txt.gz')
  inp.pipe(gzip).pipe(out)
})
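
As for the second question (compressing while the file is being written): instead of writing file.txt first and gzipping it afterwards, you can write straight into a gzip transform that is piped to the destination file, which is essentially what the dependency-free answer above does. A minimal sketch adapted to the walkdir-based code, for illustration only (not part of either answer):

const fs = require('fs')
const zlib = require('zlib')
const walkdir = require('walkdir')

const paths = walkdir('C:/', { max_depth: 0, track_inodes: true })

// Everything written here is compressed in flight and piped straight to the .gz file
const gzip = zlib.createGzip()
gzip.pipe(fs.createWriteStream('C:/test/file.txt.gz'))

gzip.write('[')

paths.on('path', function(path) {
  gzip.write(`"${path}",`)
})

paths.on('end', function() {
  gzip.write(']')
  gzip.end() // flushes the remaining compressed data and closes file.txt.gz via the pipe
})

The same back-pressure handling (checking the return value of gzip.write() and pausing/resuming walkdir) applies here as well.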
