繁体   English   中英

如何在nodejs上下载大量文件

[英]How to download large number of files on nodejs

我有一个 json 文件,其中包含大约 20000 到 100000 个链接,看起来像这样

[{
    "file_name": "Blessed_Titles.jpg",
    "url": "https://i.imgur.com/FRDibHa.jpg",
    "downloadId": "6r44r4k340rvvr"

}]

有什么方法可以一次并行下载大约 100 个? 下载 1000 个链接时,我会收到任何警告或错误吗? 现在我正在使用顺序下载,但我不确定它是否适合如此大量的链接。

这是我目前下载的方式

/**
 * Downloads every entry of `ultUrls` sequentially, one file at a time.
 *
 * A failed download is logged and skipped so that the remaining files are
 * still attempted. The running counter passed to `downloadFile` only advances
 * after a successful download.
 *
 * @param {Array<{file_name: string, url: string, downloadId: string}>} ultUrls
 *   Download descriptors to process in order.
 * @returns {Promise<void>} Resolves once every entry has been attempted.
 */
async function downloadALL(ultUrls) {
  const len = ultUrls.length;
  let i = 1;

  // `const` keeps the loop variable local; the undeclared `ult` used before
  // leaked a global and throws in strict mode / ES modules.
  for (const ult of ultUrls) {
    try {
      await downloadFile(ult, i, len);
      i++;
    } catch (err) {
      console.log(err);
    }
  }
}



/**
 * Downloads one file over HTTP(S), streams it to disk and reports progress.
 *
 * Uses names provided by the surrounding module: `http`, `https`,
 * `mainWindow` (progress events are sent on its `webContents`), `utility`
 * and `dir`. `utility.writeFile(name, dir)` is used as a writable stream
 * (it is piped into and emits 'finish'/'error') - presumably an
 * `fs.WriteStream`; confirm against the helper.
 *
 * @param {{file_name: string, url: string, downloadId: string}} ult
 *   Descriptor of the file to download.
 * @param {number} i - Position of this file in the overall batch.
 * @param {number} len - Total number of files in the batch.
 * @returns {Promise<*>} Resolves when the destination stream has finished
 *   writing; rejects on a request, response or write error.
 */
function downloadFile(ult, i, len) {
  return new Promise((resolve, reject) => {
    console.log('Downloading File: () ', ult.file_name);

    // Payload sent to the renderer on every progress tick.
    const download = {
      file: {
        name: ult.file_name,
        percentage: 0,
        downloadId: ult.downloadId,
      },
      // Overall batch progress, as a whole-number string.
      percentage: ((i / len) * 100).toFixed(0),
    };

    const client = ult.url.startsWith('https') ? https : http;

    client
      .get(ult.url, (response) => {
        const lent = Number.parseInt(response.headers['content-length'], 10);
        let cur = 0;

        response.on('data', (chunk) => {
          cur += chunk.length;
          // Without a usable content-length the ratio would be "NaN";
          // keep the last known value in that case.
          if (Number.isFinite(lent) && lent > 0) {
            download.file.percentage = ((100.0 * cur) / lent).toFixed(0);
          }
          mainWindow.webContents.send('download-info', download);
        });
        response.on('error', reject);

        // The body is streamed straight to disk; it is not buffered in memory.
        const file = utility.writeFile(ult.file_name, dir);
        response.pipe(file);

        file.on('error', (err) => {
          console.log(`ERROR:${  err}`);
          // Surface the failure to the caller instead of leaving the
          // promise pending forever.
          reject(err);
        });
        file.on('finish', () => {
          console.log('File downloaded');
          resolve(file.close());
        });
      })
      .on('error', (err) => {
        // Connection-level failures (DNS, refused, reset, ...).
        reject(err);
      });
  });
}

因此,既然您提到了并行,NodeJS 中的常用方法是使用子进程(child process),并根据可用的计算资源(例如 CPU 核心数)生成多个并行运行的进程。

这是一个伪代码,您可以参考它来创建解决方案。

// parent.js
// Splits the download list into chunks and hands each chunk to a forked
// child process; reports once every child has answered.
const child_process = require('child_process');
const os = require('os');
const _ = require('lodash'); // provides _.chunk

const numchild = os.cpus().length;
let done = 0;
const filesListJSON = [{
    "file_name": "Blessed_Titles.jpg",
    "url": "https://i.imgur.com/FRDibHa.jpg",
    "downloadId": "6r44r4k340rvvr"
}, {
    "file_name": "Blessed_Titles2.jpg",
    "url": "https://i.imgur.com/FRDibHa2.jpg",
    "downloadId": "6r44r4k340rvvr"
}, {
    "file_name": "Blessed_Titles3.jpg",
    "url": "https://i.imgur.com/FRDibHa3.jpg",
    "downloadId": "6r44r4k340rvvr"
}];

// _.chunk takes the SIZE of each chunk, not the number of chunks, so derive
// the size that yields at most one chunk per available CPU.
const chunkSize = Math.max(1, Math.ceil(filesListJSON.length / numchild));
const chunks = _.chunk(filesListJSON, chunkSize);

// Fork one child per chunk; with fewer files than CPUs this starts fewer
// children instead of sending `undefined` to the surplus ones.
chunks.forEach(function (chunk) {
    const child = child_process.fork('./child');
    // Send the chunk of the list to the respective child process.
    child.send(chunk);
    // NOTE: untested pseudo-code - verify the counting logic before relying on it.
    child.on('message', function (message) {
        console.log('[parent] received message from child:', message);
        done++;
        if (done === chunks.length) {
            console.log('[parent] received all results');
            // ... continue with whatever has to happen after all downloads.
        }
    });
});

// child.js
// Receives one chunk of the download list from the parent, downloads it,
// reports the outcome back and then closes the IPC channel so the child
// process can exit.
process.on('message', function (list) {
    // The handler's parameter is `list`; the previously logged `message`
    // identifier did not exist and threw a ReferenceError.
    console.log('[child] received message from server:', list);
    downloadFiles(list, function (done) {
        console.log("Done  downloading files : " + list.length);
        process.send({
            child: process.pid,
            result: done
        });
        process.disconnect();
    });
});

/**
 * Placeholder for the per-child download routine; the body is intentionally
 * left for the reader to implement.
 *
 * @param {Array} list - Chunk of file descriptors received from the parent.
 * @param {Function} cb - Completion callback; the message handler forwards
 *   its argument to the parent as `result`.
 */
function downloadFiles(list, cb) {
    // Loop over `list`.
    // Download each file here.
    // Finally signal completion, e.g. cb(true).
}

有关使用的逻辑的更多详细信息,请参阅此链接

另外,我使用了 lodash 库中的 chunk 函数来拆分数组进行处理。 https://lodash.com/docs/3.10.1#chunk

我建议使用 bluebird:这个 Promise 库提供了可以限制并发数量的批量 Promise 处理方案(Promise.map)。

这是他们教程的链接: http://bluebirdjs.com/docs/api/promise.map.html

这是针对您的案例的bluebird代码解决方案:

// don't forget to run `npm install bluebird` first
const Promise = require('bluebird');

/**
 * Downloads all entries of `ultUrls` through Bluebird's `Promise.map`.
 *
 * @param {Array} ultUrls - Download descriptors, each handed to `downloadFile`.
 * @returns {Promise<void>} Settles once the whole mapping has settled.
 */
async function downloadAll(ultUrls) {
  // `concurrency` caps how many downloadFile promises may run at the same time.
  const options = { concurrency: 100 };

  // Wrap this call in try/catch if failures should be handled here.
  await Promise.map(ultUrls, downloadFile, options);
}

// Placeholder for the mapper passed to Promise.map: it no longer needs the
// i and len parameters.
function downloadFile() {
  // TODO: move the download logic here and stop using the i and len parameters.
}

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM