简体   繁体   中英

How to concurrently download files using cheerio and Node.js?

I have a website with multiple pages; each page lists download links that I want to scrape and download. I have a few issues with it:

  • My script only downloads about 4-5 files and then gets stuck.
  • I would like to download as many files concurrently as my CPU can handle.
  • I ran into the maximum event listeners warning. I don't understand why that happens, so I just raised the limit (see `_maxListeners` in the code below).
  • How to follow redirects purely using request module (without follow-redirects)?
  • How can I download the file like the browser does, without specifying its name? There is no Content-Disposition header, but I think the browser follows redirects and the redirected URL has the filename in its path.

My current code looks like so:

var request = require('request');
var cheerio = require('cheerio');
var https = require('follow-redirects').https;
// Raise the "possible memory leak" listener warning threshold through the
// documented EventEmitter API instead of patching the private
// `_maxListeners` field on the prototype.
require('events').EventEmitter.defaultMaxListeners = 1000;

/**
 * Fetches every listing page and hands each response to `gotHTML`.
 *
 * Pages are requested through a small pool of workers rather than all at
 * once, so at most `MAX_PARALLEL_PAGES` listing requests are in flight at any
 * time. Each worker requests the next unclaimed page as soon as its current
 * page has been handed to `gotHTML`.
 */
(function crawlPages() {
    var FIRST_PAGE = 1;
    var LAST_PAGE = 10000;
    // Upper bound on listing pages being fetched simultaneously.
    var MAX_PARALLEL_PAGES = 10;

    var nextPage = FIRST_PAGE;

    function fetchNextPage() {
        if (nextPage > LAST_PAGE) {
            return;
        }
        // Declared locally: the original assigned to an undeclared `url`,
        // which created an implicit global shared by every iteration.
        var pageUrl = 'http://mywebsite.com/files?page=' + nextPage;
        nextPage += 1;

        request(pageUrl, function (err, resp, html) {
            try {
                gotHTML(err, resp, html);
            } finally {
                // Keep the pool going even if handling this page threw.
                fetchNextPage();
            }
        });
    }

    for (var worker = 0; worker < MAX_PARALLEL_PAGES; worker++) {
        fetchNextPage();
    }
})();

/**
 * `request` callback for a listing page: extracts the link of every
 * `.file-header` element and starts a download for each one.
 *
 * @param {Error|null} err  Error reported by `request`, if any.
 * @param {Object} resp     Response object passed by `request` (unused here).
 * @param {string} html     Body of the listing page.
 */
function gotHTML(err, resp, html) {
    // A failed request delivers no body; bail out instead of parsing undefined.
    if (err || !html) {
        console.error('failed to fetch listing page:', err || 'empty response body');
        return;
    }

    var $ = cheerio.load(html);
    $('.file-header').each(function() {
        var data = $(this);
        var fileLink = data.children().first().children().first().attr('href');
        // attr() yields undefined when the expected element/attribute is
        // missing; skip this entry rather than crash on .substring().
        if (!fileLink) {
            return;
        }
        // Drops the first 10 characters of the href — presumably a fixed path
        // prefix in front of the file identifier; TODO confirm against the site.
        var fileName = fileLink.substring(10);
        var downloadLink = 'https://mywebsite.com/api/download/' + fileName;
        download(downloadLink, function(downloadErr) {
            if (downloadErr) {
                console.error('download failed: ' + downloadLink, downloadErr);
                return;
            }
            console.log('downloaded');
        });
    });
}

/**
 * Downloads `url` (following redirects via the `https` object from
 * follow-redirects) into the current working directory, naming the file after
 * the last path segment of the final, post-redirect URL.
 *
 * @param {string} url   Address to download.
 * @param {function(Error=)} cb  Called exactly once: with no error after the
 *     file has been written and closed, or with the error that stopped the
 *     download.
 */
function download(url, cb) {
  // Required locally because the file header does not load these modules.
  var fs = require('fs');
  var path = require('path');
  var parseUrl = require('url').parse;

  var settled = false;
  // Several streams can fail independently; report only the first outcome.
  function finish(err) {
    if (settled) {
      return;
    }
    settled = true;
    if (typeof cb === 'function') {
      cb(err);
    }
  }

  var req = https.get(url, function(response) {
    if (response.statusCode < 200 || response.statusCode >= 300) {
      // Drain the body so the socket is released back to the agent.
      response.resume();
      finish(new Error('Unexpected status ' + response.statusCode + ' for ' + url));
      return;
    }

    // follow-redirects exposes the final address as `responseUrl`; the last
    // response in a redirect chain normally carries no Location header.
    var location = response.responseUrl || url;
    console.log(location);
    // Use only the path part so a query string never ends up in the name.
    var fileName = path.basename(parseUrl(location).pathname || '');
    console.log(fileName);
    if (!fileName) {
      response.resume();
      finish(new Error('Cannot derive a file name from ' + location));
      return;
    }

    var file = fs.createWriteStream(fileName);
    file.on('error', function(err) {
      response.unpipe(file);
      response.resume();
      finish(err);
    });
    response.on('error', function(err) {
      file.destroy();
      finish(err);
    });
    file.on('finish', function() {
      file.close(finish);
    });
    response.pipe(file);
  });

  // Connection-level failures (DNS, refused, reset) surface on the request.
  req.on('error', finish);
}

The default HTTP/HTTPS Agent only uses a maximum of 5 sockets (`maxSockets`) for requests to the same origin (this was the default before Node.js v0.12; later versions default to `Infinity`). So this could be causing some issues for you.

Try changing this:

var request = https.get(url, function(response) {

to this:

var options = require('url').parse(url);
options.agent = false; // or use a custom https.Agent with a higher `maxSockets`
var request = https.get(options, function(response) {

The technical posts on this site are licensed under CC BY-SA 4.0. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM