[英]Trouble using nested callbacks in NodeJS
我正在编写一个程序:先从站点抓取链接,然后再逐个抓取这些链接以获取信息。为了抓取该站点,必须先登录。顺序如下:登录 -> 抓取索引页以获取链接 -> 抓取各个链接以获取信息
登录函数的回调输出的是一个结果为空的对象 { results: [], hasMore: true },所以我的代码有问题(抓取部分本身是有效的):
// HTTP client with the cookie jar switched on (necessary for persistent login).
var request = require('request').defaults({jar: true});
// HTML parser used to query the downloaded pages.
var cheerio = require('cheerio');

// Index page that lists the torrents, and the login form target.
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

// Form fields posted to the login page.
var credentials = {
    username: 'user1',
    password: 'passpass'
};

// Entry point: log in, run the scrape and print whatever it hands back.
login(function printResult(result) {
    console.log(result);
});
/**
 * Posts the credentials to the login page and, when the request completes
 * without an error, starts scraping the index page.
 *
 * @param {function(Object)} callback - Receives the value produced by
 *        scrapeTorrents(). It is not invoked when the login request fails;
 *        in that case only "Login error" is logged.
 */
function login(callback) {
    var querystring = require('querystring');
    var loginRequest = {
        uri: loginUrl,
        headers: { 'content-type': 'application/x-www-form-urlencoded' },
        body: querystring.stringify(credentials)
    };

    // Runs once the login POST has finished.
    function onLoginResponse(err, res, body) {
        if (err) {
            console.log("Login error");
            return;
        }
        scrapeTorrents(url1, function (result) {
            callback(result);
        });
    }

    request.post(loginRequest, onLoginResponse);
}
/**
 * Downloads the index page, collects the `href` of the first child of every
 * `span.title` element and hands that list to scrapeTorrentDetails().
 *
 * @param {string} url - Address of the index page.
 * @param {function(Object)} callback - Receives the value produced by
 *        scrapeTorrentDetails(). Not invoked when the request fails; in that
 *        case only "Main scrape error" is logged.
 */
function scrapeTorrents(url, callback) {
    // Runs once the index page has been downloaded.
    function onIndexPage(err, res, body) {
        if (err) {
            console.log("Main scrape error");
            return;
        }

        var $ = cheerio.load(body);
        var links = [];
        $('span.title').each(function (index, element) {
            var firstChild = $(this).children().eq(0);
            links.push(firstChild.attr('href'));
        });

        scrapeTorrentDetails(links, function (result) {
            callback(result);
        });
    }

    request(url, onIndexPage);
}
/**
 * Requests every detail page in `links` and turns each response into a movie
 * record that is appended to a shared `results` array.
 *
 * Note: `callback` is invoked right after the requests have been issued, not
 * after they have completed, so `results` only holds the records that were
 * pushed up to that moment.
 *
 * @param {string[]} links - Paths that are appended to "https://example.org".
 * @param {function(Object)} callback - Receives `{ results, hasMore: true }`.
 */
function scrapeTorrentDetails(links, callback) {
    var results = [];

    // Downloads one detail page and appends the parsed record to `results`.
    function getDetails(url) {
        request(url, function (err, res, body) {
            if (err) {
                console.log("Detail scrape error");
                return;
            }
            console.log("Scraping: " + url);

            var $ = cheerio.load(body);
            var cells = $('td');

            // Data of the first child node of the cell at `index`.
            function cellText(index) {
                return $(cells).get(index).firstChild.data;
            }

            var title = cellText(1);
            var hash = cellText(3).trim();
            var size = cellText(9);

            var info = {};
            var hasExtendedInfo = cells.length > 23;
            if (!hasExtendedInfo) {
                info.rlsDate = "notfound";
                info.genres = "notfound";
                info.runtime = "notfound";
                info.plot = "notfound";
                info.rating = "notfound"; // of 10
                info.imdb_id = "notfound";
                info.cover = "notfound";
                info.thumb = "notfound";
            } else {
                info.rlsDate = cellText(23) || '';
                info.genres = cellText(27) || '';
                info.runtime = cellText(31) || '';
                // The plot stays undefined when its cell has no child node.
                if ($(cells).get(33).firstChild != null) {
                    info.plot = cellText(33) || '';
                }
                info.rating = $('#imdb_rating').parent().next().text() || ''; // of 10
                info.imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
                info.cover = $('#cover').children().eq(0).get(0).attribs.href || '';
                info.thumb = $('[alt=Cover]').get(0).attribs.src || '';
                if (typeof info.cover == 'undefined') {
                    info.cover = info.thumb;
                }
            }

            results.push({
                type: 'movie',
                imdb_id: info.imdb_id,
                title: title,
                year: info.rlsDate,
                genre: info.genres,
                rating: info.rating,
                runtime: info.runtime,
                image: info.thumb,
                cover: info.cover,
                synopsis: info.plot,
                torrents: {
                    magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                    filesize: size
                }
            });
        });
    }

    var position = 0;
    while (position < links.length) {
        getDetails("https://example.org" + links[position]);
        position += 1;
    }

    callback({
        results: results,
        hasMore: true
    });
}
也许使用 Q 库的 Promise 会更好。该如何在上面的代码中实现呢?
如果您想知道代码的用途,我打算修改Popcorn-time以使用另一个torrent-tracker(不带API)。
谢谢
一个主要问题出在这段代码上:
// Excerpt from scrapeTorrentDetails(): issues one getDetails() call per link.
// Each call only starts a request; its response is handled later in a callback.
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
// This runs immediately after the loop, without waiting for any of the
// requests started above, so `results` has not been filled in yet.
callback( {
results: results,
hasMore: true
});
getDetails() 是异步的,但是您只是调用了它 links.length 次就继续往下执行——就好像这些调用都已经完成了一样。因此,在您调用回调并尝试传递结果的时候,getDetails() 中的请求一个都还没有完成。此时还没有任何结果被填入,所以结果是空的。
您在代码的其他地方都(按要求)使用了嵌套回调,却偏偏在这一处疏忽了。您需要知道所有 getDetails() 调用何时全部完成,然后再带着结果调用最终的回调。
此外,您还必须决定:是可以并行发起所有 getDetails() 调用(全部同时在途),还是您真正想要的是调用一个、等待其完成、再调用下一个,依此类推……目前您是一次性把所有请求同时发出去的;只要目标服务器不反对一次收到这么多请求,这种做法就是可行的。
有几种解决此问题的潜在策略。
给 getDetails() 添加一个回调,然后统计已经从 getDetails() 收到的回调次数;只有当计数达到 links.length(即全部完成)时,才调用最终的回调。
更改 getDetails(),使其返回一个 Promise。然后,您可以使用类似 links.map(getDetails) 的方式创建一个 Promise 数组,再用 Promise.all() 来得知它们何时全部完成。
就个人而言,我会把您的所有代码都改为使用 Promise,并使用 Bluebird Promise 库,因为它具有 Promise.map() 这样的额外功能,可以使此操作更加简单。
这是一个向getDetails()
添加回调然后计算完成次数的修复程序:
/**
 * Requests every detail page in `links` in parallel, turns each response into
 * a movie record, and invokes `callback` exactly once after every request has
 * either produced a record or failed.
 *
 * Pages that fail to download or to parse are logged and left out of the
 * results; they still count as finished so the final callback always fires.
 * Records are stored in the order the responses are handled.
 *
 * @param {string[]} links - Paths that are appended to "https://example.org".
 *        An empty (or missing) list yields an empty result.
 * @param {function(Object)} callback - Receives `{ results, hasMore: true }`.
 */
function scrapeTorrentDetails(links, callback) {
    var results = [];
    var total = links ? links.length : 0;
    var doneCnt = 0;

    // Hands the collected records to the caller.
    function finish() {
        callback({
            results: results,
            hasMore: true
        });
    }

    // Builds one movie record from the HTML of a detail page.
    // Throws when one of the required cells (title, hash, size) is missing.
    function parseMovie(body) {
        var $ = cheerio.load(body);
        var tds = $('td');

        // Data of the first child node of an optional cell, or '' when absent.
        function optionalText(index) {
            var cell = $(tds).get(index);
            return (cell && cell.firstChild && cell.firstChild.data) || '';
        }

        var title = $(tds).get(1).firstChild.data;
        var hash = $(tds).get(3).firstChild.data.trim();
        var size = $(tds).get(9).firstChild.data;

        var rlsDate = "notfound";
        var genres = "notfound";
        var runtime = "notfound";
        var plot = "notfound";
        var rating = "notfound"; // of 10
        var imdb_id = "notfound";
        var cover = "notfound";
        var thumb = "notfound";

        // Only pages with more than 23 cells carry the extended movie info.
        if (tds.length > 23) {
            rlsDate = optionalText(23);
            genres = optionalText(27);
            runtime = optionalText(31);
            plot = optionalText(33);
            rating = $('#imdb_rating').parent().next().text() || ''; // of 10
            // .attr() yields undefined instead of throwing when nothing matches.
            imdb_id = $('[name=imdbID]').attr('value') || '';
            thumb = $('[alt=Cover]').attr('src') || '';
            // Fall back to the thumbnail when there is no full-size cover link.
            cover = $('#cover').children().eq(0).attr('href') || thumb;
        }

        return {
            type: 'movie',
            imdb_id: imdb_id,
            title: title,
            year: rlsDate,
            genre: genres,
            rating: rating,
            runtime: runtime,
            image: thumb,
            cover: cover,
            synopsis: plot,
            torrents: {
                magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                filesize: size
            }
        };
    }

    // Downloads one page; `done` is called exactly once, with an error on failure.
    function getDetails(url, done) {
        request(url, function(err, res, body) {
            if (err) {
                console.log("Detail scrape error");
                done(err);
                return;
            }
            console.log("Scraping: " + url);

            var movie;
            try {
                movie = parseMovie(body);
            } catch (parseErr) {
                // Without this, one malformed page would escape the request
                // callback and the completion count would never be reached.
                console.log("Detail parse error: " + url);
                done(parseErr);
                return;
            }
            results.push(movie);
            done();
        });
    }

    // Nothing to fetch: report the empty result, but still asynchronously so
    // callers always see the callback after this function has returned.
    if (total === 0) {
        process.nextTick(finish);
        return;
    }

    for (var i = 0; i < total; i++) {
        getDetails("https://example.org" + links[i], function() {
            ++doneCnt;
            if (doneCnt === total) {
                finish();
            }
        });
    }
}
以下是对给定示例代码的重写,使用了 bind、自定义的 this 对象以及尚未完成的请求计数(我认为 Promise 会掩盖执行路径)。
回调返回空数组的原因似乎是文档中没有匹配 span.title 的 span 元素,因此没有触发更多请求。
// Scraper written with bind(): one context object (the bound `this`) carries
// the final callback, the collected results and the number of outstanding
// detail requests through every step of the chain.
var request = require('request').defaults({ jar: true }); // necessary for persistent login
var cheerio = require('cheerio');
var process = require('process');
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

// Step 1: post the credentials. The context object is created here and bound
// to the next step so that `this.callback` is available later on.
var login = function(callback) {
    request.post({
        uri: loginUrl,
        headers: { 'content-type': 'application/x-www-form-urlencoded' },
        body: require('querystring').stringify({
            username: 'user1',
            password: 'passpass'
        })
    }, fna.bind({ callback: callback }));
};

// Step 2: once logged in, download the index page.
var fna = function(err, res, body) {
    if (err) {
        console.log("Login error");
        return;
    }
    request(url1, fnb.bind(this));
};

// Step 3: collect the detail links and start one request per link.
var fnb = function(err, res, body) {
    if (err) {
        console.log("Main scrape error");
        return;
    }
    var $ = cheerio.load(body),
        links = [];
    $('span.title').each(function() {
        links.push($(this).children().first().attr('href'));
    });
    this.results = [];
    this.resultCount = links.length;
    if (this.resultCount === 0) {
        // No links: report the empty result on the next tick.
        process.nextTick(fne.bind(this));
        return;
    }
    for (var i = 0; i < links.length; i++) {
        var detailUrl = "https://example.org" + links[i];
        // The URL is bound as the first argument so fnc can log it.
        request(detailUrl, fnc.bind(this, detailUrl));
    }
};

// Step 4: parse one detail page and record the movie.
// `url` is supplied through bind(); the remaining arguments come from request.
var fnc = function(url, err, res, body) {
    if (err) {
        console.log("Detail scrape error");
        // A failed page still counts as finished; otherwise the final
        // callback would never be reached.
        fnf.call(this);
        return;
    }
    console.log("Scraping: " + url);
    var $ = cheerio.load(body),
        tds = $('td'),
        title = $(tds).get(1).firstChild.data,
        hash = $(tds).get(3).firstChild.data.trim(),
        size = $(tds).get(9).firstChild.data,
        rlsDate = "notfound",
        genres = "notfound",
        runtime = "notfound",
        plot = "notfound",
        rating = "notfound", // of 10
        imdb_id = "notfound",
        cover = "notfound",
        thumb = "notfound";
    // Only pages with more than 23 cells carry the extended movie info.
    if (tds.length > 23) {
        rlsDate = $(tds).get(23).firstChild.data || '';
        genres = $(tds).get(27).firstChild.data || '';
        runtime = $(tds).get(31).firstChild.data || '';
        if ($(tds).get(33).firstChild != null) {
            plot = $(tds).get(33).firstChild.data || '';
        }
        rating = $('#imdb_rating').parent().next().text() || ''; // of 10
        imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
        thumb = $('[alt=Cover]').get(0).attribs.src || '';
        // Fall back to the thumbnail when the cover link has no href.
        cover = $('#cover').children().eq(0).get(0).attribs.href || thumb;
    }
    this.results.push({
        type: 'movie',
        imdb_id: imdb_id,
        title: title,
        year: rlsDate,
        genre: genres,
        rating: rating,
        runtime: runtime,
        image: thumb,
        cover: cover,
        synopsis: plot,
        torrents: {
            magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
            filesize: size
        }
    });
    fnf.call(this);
};

// Marks one detail request as finished and reports once none are outstanding.
var fnf = function() {
    this.resultCount--;
    if (this.resultCount === 0) {
        fne.call(this);
    }
};

// Final step: hand the collected results to the caller.
var fne = function() {
    this.callback({
        results: this.results,
        hasMore: true
    });
};

login(function(result) {
    console.log(result);
});
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.