简体   繁体   中英

Trouble using nested callbacks in NodeJS

I'm writing a program that scrapes a site for links, then scrapes these links for information. In order to scrape the site, it is necessary to log in first. And so the order is: Log in -> Scrape the index for links -> Scrape the links for info

The callback passed to the login function prints an object whose results array is empty, { results: [], hasMore: true }, so something is wrong with my code (the scraping part itself works):

var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
var cheerio = require('cheerio');

// Pages used by the scraper: the index that lists the torrents and the login form target.
var url1 = "https://example.org/torrents/browse/index/",
    loginUrl = "https://example.org/user/account/login/";

// Form fields posted to loginUrl.
var credentials = { username: 'user1', password: 'passpass' };

// Entry point: log in, scrape, and print whatever the chain hands back.
login(function printResult(result) {
    console.log(result);
});

/**
 * Posts the credentials to the login URL and, when the request succeeds,
 * starts scraping the index page.
 *
 * @param {function} callback - Receives the result produced by scrapeTorrents.
 *     It is not called when the login request reports an error.
 */
function login(callback) {
    var querystring = require('querystring');
    var postOptions = {
        uri: loginUrl,
        headers: { 'content-type': 'application/x-www-form-urlencoded' },
        body: querystring.stringify(credentials)
    };

    // Runs once the login POST has completed.
    function onLoginResponse(err) {
        if (err) {
            console.log("Login error");
            return;
        }

        scrapeTorrents(url1, function (result) {
            callback(result);
        });
    }

    request.post(postOptions, onLoginResponse);
}

/**
 * Fetches the page at `url`, collects the href of the first child of every
 * `span.title` element, and hands those links to scrapeTorrentDetails.
 *
 * @param {string} url - Page to fetch.
 * @param {function} callback - Receives the result produced by
 *     scrapeTorrentDetails. It is not called when the request reports an error.
 */
function scrapeTorrents(url, callback) {
    // Runs once the index page request has completed.
    function onIndexResponse(err, res, body) {
        if (err) {
            console.log("Main scrape error");
            return;
        }

        var $ = cheerio.load(body);
        var links = [];

        $('span.title').each(function () {
            var firstChild = $(this).children().eq(0);
            links.push(firstChild.attr('href'));
        });

        scrapeTorrentDetails(links, function (result) {
            callback(result);
        });
    }

    request(url, onIndexResponse);
}

/**
 * Requests the detail page for every entry in `links`, builds a movie object
 * from each response, and passes { results, hasMore: true } to `callback`.
 *
 * NOTE: this is the version the question is about. `callback` is invoked
 * directly after the loop that starts the requests, not from the request
 * callbacks that fill `results`, which is why the question reports an empty
 * `results` array.
 *
 * @param {Array} links - Values appended to "https://example.org" to form each
 *     request URL (presumably site-relative paths; verify against the caller).
 * @param {function} callback - Receives { results: Array, hasMore: true }.
 */
function scrapeTorrentDetails(links, callback) {
    var results = [];

    // Requests one detail page and pushes the parsed movie onto `results`.
    function getDetails(url) {
        request(url, function(err, res, body) {
                if(err) {
                    // A failed request is only logged; nothing is added to `results`.
                    console.log("Detail scrape error");
                    return;
                }
                console.log("Scraping: " + url);
                var $ = cheerio.load(body);
                var tds = $('td');
                // Fields are read by fixed position in the list of <td> elements
                // (presumably the site's detail-table layout; verify against a real page).
                var title = $(tds).get(1).firstChild.data;
                var hash = $(tds).get(3).firstChild.data.trim();
                var size = $(tds).get(9).firstChild.data;
                //  console.log(tds.length);
                if (tds.length > 23) {
                    var rlsDate = $(tds).get(23).firstChild.data || '';;
                    var genres = $(tds).get(27).firstChild.data || '';;
                    var runtime = $(tds).get(31).firstChild.data || '';;
                    // `plot` is left undefined when this cell has no first child.
                    if ( $(tds).get(33).firstChild != null) {
                        var plot = $(tds).get(33).firstChild.data || '';;
                    }
                    var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
                    var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
                    var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
                    var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
                    // `cover` was assigned with `|| ''` above, so it is never undefined here
                    // and this fallback to `thumb` does not run.
                    if (typeof cover == 'undefined') {
                        cover = thumb;
                    }
                } else {
                    // 24 or fewer cells: use placeholder values for the extra fields.
                    var rlsDate = "notfound";
                    var genres = "notfound";
                    var runtime = "notfound";
                    var plot = "notfound";
                    var rating = "notfound"; // of 10
                    var imdb_id = "notfound";
                    var cover = "notfound";
                    var thumb = "notfound";
                }

                var movie = {
                    type: 'movie',
                    imdb_id: imdb_id,
                    title: title,
                    year: rlsDate,
                    genre: genres,
                    rating: rating,
                    runtime: runtime,
                    image: thumb,
                    cover: cover,
                    synopsis: plot,
                    torrents: {
                        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                        filesize: size
                    }
                };

                results.push(movie);
            });
    }

    // Starts one request per link without waiting for any of them.
    for (var i=0; i<links.length; i++){
            getDetails("https://example.org" + links[i]);
    }

    // Invoked right after the requests were started; nothing here waits for
    // the request callbacks above to push into `results`.
    callback( {
        results: results,
        hasMore: true
    });
}

Maybe Q promises would be better. How would I implement that in the code above?

If you're wondering what the code is for, I'm planning to modify Popcorn-time to use another torrent-tracker (without an API).

Thanks

A main problem is with this code:

// Starts one request per link; the loop does not wait for any of them.
for (var i=0; i<links.length; i++){
        getDetails("https://example.org" + links[i]);
}

// Runs immediately after the loop, before the request callbacks have added
// anything to `results`.
callback( {
    results: results,
    hasMore: true
});

getDetails() is async, but you just call it links.length times and move on - acting like they have all completed. So, none of the requests in getDetails() is done before you call the callback and try to pass the results. But, none of the results have yet been filled in so they will be empty.

You have all these other nested callbacks everywhere through your code (as required), yet you dropped the ball in this one place. You need to know when all the getDetails() calls are done before you call the final callback with the results.

In addition, you also have to decide if you're OK calling all the getDetails() calls in parallel (all in flight at once) or if what you really want to do is to call one, wait for it to finish, then call the next, etc... Right now you are putting them all in-flight at once which can work if the destination server doesn't object to that many requests all at once.


There are several potential strategies for fixing this.

  1. Add a callback to getDetails(), count how many times that callback has been invoked, and call the final callback only once the count reaches links.length.

  2. Change getDetails() to return a promise. You can then use something like links.map(getDetails) to create an array of promises that you can then use Promise.all() with to know when they are all done.

Personally, I would change all of your code to use promises and I'd use the Bluebird promises library for its extra features such as Promise.map() to make this even simpler.

Here's a fix that adds a callback to getDetails() and then counts how many are done:

/**
 * Requests the detail page for every entry in `links` (all requests are
 * started at once) and calls `callback` exactly once, after every request has
 * either produced a movie or failed.
 *
 * Pages that fail to download or to parse are logged and skipped, so
 * `results` may contain fewer entries than `links`.
 *
 * @param {string[]} links - Values appended to "https://example.org" to form
 *     each detail-page URL.
 * @param {function(Object)} callback - Receives
 *     { results: Array, hasMore: true }. Also called (asynchronously) when
 *     `links` is empty.
 */
function scrapeTorrentDetails(links, callback) {
    var results = [];
    var pending = links.length;

    // Hands the collected movies to the caller.
    function finish() {
        callback({
            results: results,
            hasMore: true
        });
    }

    // Counts one settled request (success or failure) and finishes on the last.
    function onDetailsDone() {
        pending--;
        if (pending === 0) {
            finish();
        }
    }

    // Builds a movie object from a detail page; throws if an expected cell or
    // element is missing from the markup.
    function parseMovie(body) {
        var $ = cheerio.load(body);
        var tds = $('td');

        // Fields are read by fixed position in the list of <td> elements.
        var title = tds.get(1).firstChild.data;
        var hash = tds.get(3).firstChild.data.trim();
        var size = tds.get(9).firstChild.data;

        var rlsDate = "notfound";
        var genres = "notfound";
        var runtime = "notfound";
        var plot = "notfound";
        var rating = "notfound"; // of 10
        var imdb_id = "notfound";
        var cover = "notfound";
        var thumb = "notfound";

        if (tds.length > 23) {
            rlsDate = tds.get(23).firstChild.data || '';
            genres = tds.get(27).firstChild.data || '';
            runtime = tds.get(31).firstChild.data || '';
            // An empty plot cell yields '' instead of leaving the field undefined.
            var plotNode = tds.get(33).firstChild;
            plot = plotNode != null ? (plotNode.data || '') : '';
            rating = $('#imdb_rating').parent().next().text() || ''; // of 10
            imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
            cover = $('#cover').children().eq(0).get(0).attribs.href || '';
            thumb = $('[alt=Cover]').get(0).attribs.src || '';
            // `cover` is '' (never undefined) when the link has no href, so
            // test for emptiness to make the thumbnail fallback effective.
            if (!cover) {
                cover = thumb;
            }
        }

        return {
            type: 'movie',
            imdb_id: imdb_id,
            title: title,
            year: rlsDate,
            genre: genres,
            rating: rating,
            runtime: runtime,
            image: thumb,
            cover: cover,
            synopsis: plot,
            torrents: {
                magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                filesize: size
            }
        };
    }

    // Downloads and parses one detail page; always calls `done` exactly once.
    function getDetails(url, done) {
        request(url, function(err, res, body) {
            if (err) {
                console.log("Detail scrape error");
                done(err);
                return;
            }
            console.log("Scraping: " + url);

            var movie;
            try {
                movie = parseMovie(body);
            } catch (parseErr) {
                // Without this, an unexpected page layout would throw inside the
                // request callback and the final callback would never fire.
                console.log("Detail parse error: " + url);
                done(parseErr);
                return;
            }

            results.push(movie);
            done();
        });
    }

    // With no links no request callback would ever run, so report the empty
    // result directly (deferred so the callback is always asynchronous).
    if (pending === 0) {
        process.nextTick(finish);
        return;
    }

    for (var i = 0; i < links.length; i++) {
        getDetails("https://example.org" + links[i], onDetailsDone);
    }
}

The following is the given sample code rewritten to use bind, a custom this object and a count of the requests that have yet to complete (I think promises obscure the execution path).

The reason that the callback is returning an empty array seems to be that there are no span elements in the document with the class title (which is what the span.title selector matches), so as a result no further requests are triggered.

 // Callback-style scraper: log in, read the index page for detail links, then
 // fetch every detail page. State shared between the handlers travels on a
 // custom `this` object ({ callback, results, resultCount }) that is bound
 // onto each handler with bind().
 var request = require('request').defaults({ jar: true }); // necessary for persistent login
 var cheerio = require('cheerio');
 var process = require('process');
 var url1 = "https://example.org/torrents/browse/index/";
 var loginUrl = "https://example.org/user/account/login/";

 // Delivers the accumulated results to the caller's callback.
 var fne = function() {
     this.callback({
         results: this.results,
         hasMore: true
     });
 };

 // Marks one detail request as settled (success or failure) and reports
 // once none remain outstanding.
 var fnSettled = function() {
     this.resultCount--;
     if (this.resultCount === 0) {
         fne.call(this);
     }
 };

 // Handles one detail-page response. `url` is pre-bound by fnb so that it can
 // be logged here; the remaining arguments come from request().
 var fnc = function(url, err, res, body) {
     if (err) {
         console.log("Detail scrape error");
         // A failed request must still be counted, otherwise the final
         // callback would never fire.
         fnSettled.call(this);
         return;
     }
     console.log("Scraping: " + url);
     var $ = cheerio.load(body),
         tds = $('td'),
         title = $(tds).get(1).firstChild.data,
         hash = $(tds).get(3).firstChild.data.trim(),
         size = $(tds).get(9).firstChild.data,
         rlsDate = "notfound",
         genres = "notfound",
         runtime = "notfound",
         plot = "notfound",
         rating = "notfound", // of 10
         imdb_id = "notfound",
         cover = "notfound",
         thumb = "notfound";
     if (tds.length > 23) {
         rlsDate = $(tds).get(23).firstChild.data || '';
         genres = $(tds).get(27).firstChild.data || '';
         runtime = $(tds).get(31).firstChild.data || '';
         if ($(tds).get(33).firstChild != null) {
             plot = $(tds).get(33).firstChild.data || '';
         }
         rating = $('#imdb_rating').parent().next().text() || ''; // of 10
         imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
         cover = $('#cover').children().eq(0).get(0).attribs.href || '';
         thumb = $('[alt=Cover]').get(0).attribs.src || '';
         // `cover` is '' (never undefined) when the href is missing, so test
         // for emptiness to make the thumbnail fallback effective.
         if (!cover) {
             cover = thumb;
         }
     }
     this.results.push({
         type: 'movie',
         imdb_id: imdb_id,
         title: title,
         year: rlsDate,
         genre: genres,
         rating: rating,
         runtime: runtime,
         image: thumb,
         cover: cover,
         synopsis: plot,
         torrents: {
             magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
             filesize: size
         }
     });
     fnSettled.call(this);
 };

 // Handles the index-page response: collects the detail links and starts one
 // request per link.
 var fnb = function(err, res, body) {
     if (err) {
         console.log("Main scrape error");
         return;
     }
     var $ = cheerio.load(body),
         links = [];
     $('span.title').each(function() {
         links.push($(this).children().first().attr('href'));
     });
     this.results = [];
     this.resultCount = links.length;
     if (this.resultCount === 0) {
         // Nothing to fetch: report the empty result on the next tick.
         process.nextTick(fne.bind(this));
         return;
     }
     for (var i = 0; i < links.length; i++) {
         var detailUrl = "https://example.org" + links[i];
         request(detailUrl, fnc.bind(this, detailUrl));
     }
 };

 // Handles the login response and requests the index page.
 var fna = function(err, res, body) {
     if (err) {
         console.log("Login error");
         return;
     }
     request(url1, fnb.bind(this));
 };

 // Posts the credentials; `callback` eventually receives
 // { results: Array, hasMore: true }.
 var login = function(callback) {
     request.post({
         uri: loginUrl,
         headers: {
             'content-type': 'application/x-www-form-urlencoded'
         },
         body: require('querystring').stringify({
             username: 'user1',
             password: 'passpass'
         })
     }, fna.bind({
         callback: callback
     }));
 };

 login(function(result) {
     console.log(result);
 });

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM