简体   繁体   English

在NodeJS中使用嵌套回调很麻烦

[英]Trouble using nested callbacks in NodeJS

I'm writing a program that scrapes a site for links, then scrapes these links for information. 我正在编写一个程序,它先从站点抓取链接,然后再抓取这些链接以获取信息。 In order to scrape the site, it is necessary to log in first. 为了抓取该站点,必须先登录。 And so the order is: Log in -> Scrape the index for links -> Scrape the links for info 因此顺序如下:登录->抓取索引页中的链接->抓取这些链接以获取信息

The callback to the login function prints an empty array { results: [], hasMore: true } , so something is wrong with my code (the scraping part works): 登录函数的回调会输出一个空数组{ results: [], hasMore: true } ,所以我的代码有问题(抓取部分有效):

// HTTP client; the second declaration rebinds `request` to a copy whose
// defaults enable the cookie jar.
var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
var cheerio = require('cheerio');

// Index page that lists the torrents, and the form endpoint used to log in.
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

// Form fields posted to loginUrl by login().
var credentials = {
    username: 'user1',
    password: 'passpass'
};

// Entry point: login() is a function declaration further down, so it is
// hoisted and can be called here. The final scrape result is just printed.
login(function (result) {
    console.log(result);
});

/**
 * Posts the credentials to the login URL and, once the response arrives,
 * starts scraping the index page.
 *
 * @param {function(Object): void} callback - Receives whatever
 *     scrapeTorrents() reports. It is not invoked when the login request
 *     fails; the failure is only logged.
 */
function login(callback) {
    var formBody = require('querystring').stringify(credentials);
    var postOptions = {
        uri: loginUrl,
        headers: { 'content-type': 'application/x-www-form-urlencoded' },
        body: formBody
    };

    function onLoginResponse(err, res, body) {
        if (!err) {
            // Logged in: continue with the index page and relay its result.
            scrapeTorrents(url1, function (result) {
                callback(result);
            });
            return;
        }
        console.log("Login error");
    }

    request.post(postOptions, onLoginResponse);
}

/**
 * Downloads the index page at `url`, collects the href of the first child of
 * every `span.title` element, and passes those links on to
 * scrapeTorrentDetails().
 *
 * @param {string} url - Address of the index page.
 * @param {function(Object): void} callback - Receives the result reported by
 *     scrapeTorrentDetails(). Not invoked when the index request fails; the
 *     failure is only logged.
 */
function scrapeTorrents(url, callback) {
    function onIndexPage(err, res, body) {
        if (err) {
            console.log("Main scrape error");
            return;
        }

        var $ = cheerio.load(body);
        var detailLinks = [];

        $('span.title').each(function () {
            var firstChild = $(this).children().eq(0);
            detailLinks.push(firstChild.attr('href'));
        });

        scrapeTorrentDetails(detailLinks, function (result) {
            callback(result);
        });
    }

    request(url, onIndexPage);
}

/**
 * Requests every detail page in `links`, builds one movie object per page and
 * reports `{ results, hasMore }` through `callback`.
 *
 * NOTE: as written, `callback` is invoked right after the requests have been
 * started, not after they have finished, so `results` is still empty at that
 * point.
 *
 * @param {string[]} links - Site-relative paths; "https://example.org" is
 *     prepended to each one.
 * @param {function(Object): void} callback - Receives `{ results, hasMore }`.
 */
function scrapeTorrentDetails(links, callback) {
    // Filled in by the request callbacks below, one entry per scraped page.
    var results = [];

    // Starts the request for one detail page; returns before the response arrives.
    function getDetails(url) {
        request(url, function(err, res, body) {
                if(err) {
                    console.log("Detail scrape error");
                    return;
                }
                console.log("Scraping: " + url);
                var $ = cheerio.load(body);
                // Fields are read by position from the page's <td> cells.
                var tds = $('td');
                var title = $(tds).get(1).firstChild.data;
                var hash = $(tds).get(3).firstChild.data.trim();
                var size = $(tds).get(9).firstChild.data;
                //  console.log(tds.length);
                // More than 23 cells: the extra movie fields are read as well;
                // otherwise every extra field is set to "notfound".
                if (tds.length > 23) {
                    var rlsDate = $(tds).get(23).firstChild.data || '';;
                    var genres = $(tds).get(27).firstChild.data || '';;
                    var runtime = $(tds).get(31).firstChild.data || '';;
                    // plot is left undefined when cell 33 has no child node.
                    if ( $(tds).get(33).firstChild != null) {
                        var plot = $(tds).get(33).firstChild.data || '';;
                    }
                    var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
                    var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
                    var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
                    var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
                    // NOTE: cover is always a string here because of the `|| ''`
                    // above, so this fallback never runs.
                    if (typeof cover == 'undefined') {
                        cover = thumb;
                    }
                } else {
                    var rlsDate = "notfound";
                    var genres = "notfound";
                    var runtime = "notfound";
                    var plot = "notfound";
                    var rating = "notfound"; // of 10
                    var imdb_id = "notfound";
                    var cover = "notfound";
                    var thumb = "notfound";
                }

                var movie = {
                    type: 'movie',
                    imdb_id: imdb_id,
                    title: title,
                    year: rlsDate,
                    genre: genres,
                    rating: rating,
                    runtime: runtime,
                    image: thumb,
                    cover: cover,
                    synopsis: plot,
                    torrents: {
                        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                        filesize: size
                    }
                };

                results.push(movie);
            });
    }

    // Starts all requests at once; nothing here waits for their callbacks.
    for (var i=0; i<links.length; i++){
            getDetails("https://example.org" + links[i]);
    }

    // NOTE: runs immediately after the loop, before any request callback has
    // pushed into results - this is why the caller sees an empty array.
    callback( {
        results: results,
        hasMore: true
    });
}

Maybe Q promises would be better. 也许Q承诺会更好。 How would I implement that in the code above? 如何在上面的代码中实现呢?

If you're wondering what the code is for, I'm planning to modify Popcorn-time to use another torrent-tracker (without an API). 如果您想知道代码的用途,我打算修改Popcorn-time以使用另一个torrent-tracker(不带API)。

Thanks 谢谢

A main problem is with this code: 一个主要问题是与此代码:

// Starts one request per link; each getDetails() call returns right away.
for (var i=0; i<links.length; i++){
        getDetails("https://example.org" + links[i]);
}

// Invoked straight after the loop, i.e. before the request callbacks have
// added anything to results.
callback( {
    results: results,
    hasMore: true
});

getDetails() is async, but you just call it links.length times and move on - acting like they have all completed. getDetails()是异步的,但您只是调用了它links.length次就继续往下执行——就好像它们都已经完成了一样。 So, none of the requests in getDetails() is done before you call the callback and try to pass the results. 因此,在您调用回调并尝试传递结果之前,getDetails()中的请求一个都还没有完成。 But, none of the results have yet been filled in so they will be empty. 此时结果尚未填入,因此结果数组是空的。

You have all these other nested callbacks everywhere through your code (as required), yet you dropped the ball in this one place. 您的代码中其他地方都按要求使用了嵌套回调,却偏偏在这一处疏忽了。 You need to know when all the getDetails() calls are done before you call the final callback with the results. 您需要先知道所有getDetails()调用何时完成,然后再带着结果调用最终的回调。

In addition, you also have to decide if you're OK calling all the getDetails() calls in parallel (all in flight at once) or if what you really want to do is to call one, wait for it to finish, then call the next, etc... Right now you are putting them all in-flight at once which can work if the destination server doesn't object to that many requests all at once. 此外,您还必须确定是否可以并行调用所有getDetails()调用(一次都在运行中),或者您真正想做的是调用一个,等待其完成,然后调用接下来,等等...现在,如果目标服务器不一次拒绝那么多请求,那么您可以一次将它们全部进行传输。


There are several potential strategies for fixing this. 有几种解决此问题的潜在策略。

  1. Add a callback to getDetails(), keep a count of how many of those callbacks have fired, and call the final callback only once that count reaches links.length. 为getDetails()添加一个回调,统计已经触发了多少个这样的回调,只有当计数达到links.length时才调用最终的回调。

  2. Change getDetails() to return a promise. 更改getDetails()以返回一个promise。 You can then use something like links.map(getDetails) to create an array of promises that you can then use Promise.all() with to know when they are all done. 然后,您可以使用类似links.map(getDetails)的方式创建一个promise数组,再用Promise.all()来了解它们何时全部完成。

Personally, I would change all of your code to use promises and I'd use the Bluebird promises library for its extra features such as Promise.map() to make this even simpler. 就个人而言,我会把您的所有代码改为使用promise,并使用Bluebird promise库,因为它具有Promise.map()这样的额外功能,可以使此操作更加简单。

Here's a fix that adds a callback to getDetails() and then counts how many are done: 这是一个向getDetails()添加回调然后计算完成次数的修复程序:

/**
 * Scrapes every detail page in `links` in parallel and invokes `callback`
 * exactly once, after all of the requests have settled.
 *
 * Pages that fail to download or to parse are logged and left out of the
 * result; the remaining movies are reported in the order of `links`,
 * independent of the order in which the responses arrive.
 *
 * @param {string[]} links - Site-relative paths; "https://example.org" is
 *     prepended to each one.
 * @param {function({results: Object[], hasMore: boolean}): void} callback -
 *     Always called asynchronously, also when `links` is empty.
 */
function scrapeTorrentDetails(links, callback) {
    // Indexed by link position so the output order does not depend on timing.
    var results = [];
    var pending = links.length;

    // Reports the collected movies; filter() drops the slots of failed pages.
    function finish() {
        callback({
            results: results.filter(function (movie) {
                return movie !== undefined;
            }),
            hasMore: true
        });
    }

    // Marks one request as settled and reports once none are outstanding.
    function settle() {
        pending -= 1;
        if (pending === 0) {
            finish();
        }
    }

    // Text of the first child node of cell `index`, or '' when it is missing.
    function cellText(tds, index) {
        var cell = tds.get(index);
        return (cell && cell.firstChild && cell.firstChild.data) || '';
    }

    // Attribute `name` of the first matched element, or '' when it is missing.
    function attrOf(selection, name) {
        var node = selection.get(0);
        return (node && node.attribs && node.attribs[name]) || '';
    }

    // Builds the movie object for one detail page; throws when the mandatory
    // title/hash/size cells are absent.
    function parseMovie(body) {
        var $ = cheerio.load(body);
        var tds = $('td');
        var title = tds.get(1).firstChild.data;
        var hash = tds.get(3).firstChild.data.trim();
        var size = tds.get(9).firstChild.data;
        //  console.log(tds.length);

        // Pages without the extended movie table keep these placeholders.
        var rlsDate = "notfound";
        var genres = "notfound";
        var runtime = "notfound";
        var plot = "notfound";
        var rating = "notfound"; // of 10
        var imdb_id = "notfound";
        var cover = "notfound";
        var thumb = "notfound";

        if (tds.length > 23) {
            rlsDate = cellText(tds, 23);
            genres = cellText(tds, 27);
            runtime = cellText(tds, 31);
            plot = cellText(tds, 33);
            rating = $('#imdb_rating').parent().next().text() || ''; // of 10
            imdb_id = attrOf($('[name=imdbID]'), 'value');
            cover = attrOf($('#cover').children().eq(0), 'href');
            thumb = attrOf($('[alt=Cover]'), 'src');
            // Fall back to the thumbnail when the page has no full-size cover.
            if (cover === '') {
                cover = thumb;
            }
        }

        return {
            type: 'movie',
            imdb_id: imdb_id,
            title: title,
            year: rlsDate,
            genre: genres,
            rating: rating,
            runtime: runtime,
            image: thumb,
            cover: cover,
            synopsis: plot,
            torrents: {
                magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                filesize: size
            }
        };
    }

    // Requests one detail page and stores its movie in slot `index`.
    function getDetails(url, index) {
        request(url, function(err, res, body) {
            if (err) {
                console.log("Detail scrape error");
                settle();
                return;
            }
            console.log("Scraping: " + url);
            try {
                results[index] = parseMovie(body);
            } catch (parseErr) {
                // One malformed page must not keep the final callback from firing.
                console.log("Detail parse error: " + url + " (" + parseErr.message + ")");
            }
            settle();
        });
    }

    if (pending === 0) {
        // Nothing to request: still report, and still asynchronously.
        process.nextTick(finish);
        return;
    }

    for (var i = 0; i < links.length; i++) {
        getDetails("https://example.org" + links[i], i);
    }
}

The following is the given sample code rewritten to use bind, a custom this object and a count of the requests that have yet to complete (I think promises obscure the execution path). 以下是重写的给定示例代码,以使用bind,自定义此对象以及尚未完成的请求计数(我认为承诺掩盖了执行路径)。

The reason that the callback is returning an empty array seems to be that there are no spans in the document with the title class (the selector used is span.title), so as a result no further requests are triggered. 回调返回空数组的原因似乎是文档中没有带有title类的span元素(所用选择器为span.title),因此没有触发更多请求。

 var request = require('request').defaults({ jar: true }), // necessary for persistent login
     cheerio = require('cheerio'),
     process = require('process'),
     url1 = "https://example.org/torrents/browse/index/",
     loginUrl = "https://example.org/user/account/login/",

     // Step 1: post the credentials. The object bound as `this` travels through
     // every later step and carries the final callback plus the shared state.
     login = function(callback) {
         request.post({
             uri: loginUrl,
             headers: { 'content-type': 'application/x-www-form-urlencoded' },
             body: require('querystring').stringify({
                 username: 'user1',
                 password: 'passpass'
             })
         }, fna.bind({ callback: callback }));
     },

     // Step 2: login response received; fetch the index page.
     fna = function(err, res, body) {
         if (err) {
             console.log("Login error");
             return;
         }
         request(url1, fnb.bind(this));
     },

     // Step 3: collect the detail links and start one request per link.
     fnb = function(err, res, body) {
         if (err) {
             console.log("Main scrape error");
             return;
         }
         var $ = cheerio.load(body),
             links = [];
         $('span.title').each(function() {
             links.push($(this).children().first().attr('href'));
         });
         this.results = [];
         this.resultCount = links.length; // requests that have yet to complete
         if (this.resultCount) {
             for (var i = 0; i < links.length; i++) {
                 var detailUrl = "https://example.org" + links[i];
                 // Bind the URL as first argument so fnc can log it; the
                 // request callback itself only receives (err, res, body).
                 request(detailUrl, fnc.bind(this, detailUrl));
             }
         } else {
             // No links: still report, and still asynchronously.
             process.nextTick(fne.bind(this));
         }
     },

     // Step 4: one detail page has settled (successfully or not).
     fnc = function(url, err, res, body) {
         if (err) {
             console.log("Detail scrape error");
         } else {
             console.log("Scraping: " + url);
             var $ = cheerio.load(body),
                 tds = $('td'),
                 title = $(tds).get(1).firstChild.data,
                 hash = $(tds).get(3).firstChild.data.trim(),
                 size = $(tds).get(9).firstChild.data,
                 rlsDate = "notfound",
                 genres = "notfound",
                 runtime = "notfound",
                 plot = "notfound",
                 rating = "notfound", // of 10
                 imdb_id = "notfound",
                 cover = "notfound",
                 thumb = "notfound";
             if (tds.length > 23) {
                 rlsDate = $(tds).get(23).firstChild.data || '';
                 genres = $(tds).get(27).firstChild.data || '';
                 runtime = $(tds).get(31).firstChild.data || '';
                 if ($(tds).get(33).firstChild != null) {
                     plot = $(tds).get(33).firstChild.data || '';
                 }
                 rating = $('#imdb_rating').parent().next().text() || ''; // of 10
                 imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
                 cover = $('#cover').children().eq(0).get(0).attribs.href || '';
                 thumb = $('[alt=Cover]').get(0).attribs.src || '';
                 // cover is '' (never undefined) when the href is missing.
                 if (cover === '') {
                     cover = thumb;
                 }
             }
             this.results.push({
                 type: 'movie',
                 imdb_id: imdb_id,
                 title: title,
                 year: rlsDate,
                 genre: genres,
                 rating: rating,
                 runtime: runtime,
                 image: thumb,
                 cover: cover,
                 synopsis: plot,
                 torrents: {
                     magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                     filesize: size
                 }
             });
         }
         // Count failed requests as settled too; otherwise a single error
         // would keep the final callback from ever firing.
         this.resultCount--;
         if (this.resultCount === 0) {
             fne.call(this);
         }
     },

     // Final step: hand the collected results to the caller of login().
     fne = function() {
         this.callback({
             results: this.results,
             hasMore: true
         });
     };

 login(function(result) {
     console.log(result);
 });

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM