[英]NodeJS multiple requests
我正在编写一个网络爬虫,它根据如下所示的列表发出多个请求
1. Category1
1a. categoryItem1
1b. categoryItem2
2. Category2
2a. categoryItem1
2b. categoryItem2
2c. categoryItem3
3. Category3
3a. categoryItem1
Category 和 categoryItem 都是链接。一次只能展开 1 个 Category。Categories 和 categoryItems 的数量可能会发生变化,所以我事先不知道确切的数量。
我正在收集每个 categoryItem 页面上的数据,并保存到如下所示的 JSON 中:
{
    "Category1": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        }
    ],
    "Category2": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        },
        {
            "categoryItem3": {
                // Details saved here
            }
        }
    ],
    "Category3": [
        {
            "categoryItem1": {
                // Details saved here
            }
        }
    ]
}
我唯一剩下的就是弄清楚如何使以下行为按顺序(同步)执行:
- Category 列表
- categoryItem 详情页面
如果您想知道,这是我遵循的网络抓取工具教程。由于异步调用,我不知道最后一页何时被解析完成,所以下面是脚本的结构:
server.js
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
/**
 * GET /scrape - walks three page levels (category list -> expanded
 * category -> categoryItem page) and collects the results into
 * { <category>: [ { <categoryItemName>: <details> }, ... ] }.
 *
 * NOTE: every request.get() call is asynchronous and nothing below waits for
 * the nested calls, so output.json is written and the response is sent before
 * the category / categoryItem requests have completed. That ordering is
 * intentionally left untouched here; this version only fixes variable
 * scoping, the typeof typo and the silently ignored errors.
 */
app.get('/scrape', function (req, res) {
    // Declared locally (previously implicit globals) so that concurrent
    // hits on /scrape do not share or overwrite each other's state.
    const globalJSON = {};
    const baseUrl = 'http://...';

    // 1.) open page with list
    request.get(baseUrl, function (error, response, html) {
        if (error) {
            // Without a reply the client would wait forever.
            console.error('Failed to load the category list:', error);
            res.status(500).send('Failed to load the category list');
            return;
        }

        const $ = cheerio.load(html);

        // select the list
        $('#categoryListSelector').each(function () {
            const listItem = $(this).find('#listItemSelector');
            const expansionLink = listItem.find('a').attr('href'); // <a href=""></a>
            const category = listItem.find('font').text();

            // Save category to global json
            globalJSON[category] = [];

            // 2.) Expand the list by opening expansionLink
            request.get(baseUrl + expansionLink, function (error, response, html) {
                if (error) {
                    console.error('Failed to expand category "' + category + '":', error);
                    return;
                }

                const $ = cheerio.load(html);

                // Select the sub items of each list item
                $('#subItem selector').each(function () {
                    const categoryItemPageLinkElement = $(this).find('a');
                    const categoryItemName = categoryItemPageLinkElement.text();
                    const categoryItemLink = categoryItemPageLinkElement.attr('href');

                    // Skips undefined, null and the empty string - the same
                    // cases the original three-part check was aiming at.
                    if (!categoryItemLink) {
                        return;
                    }

                    // 3.) Open the categoryItem page to start gathering data
                    request.get(baseUrl + categoryItemLink, function (error, response, html) {
                        if (error) {
                            console.error('Failed to load categoryItem "' + categoryItemName + '":', error);
                            return;
                        }

                        const $ = cheerio.load(html);

                        // Created inside the callback so every item gets its
                        // own objects; as shared globals they were overwritten
                        // by later items before this callback ran.
                        const categoryItemDetails = {};
                        const categoryItemObject = {}; // { categoryItemName: categoryItemDetails }

                        // GATHER and save data here

                        // Done gathering data save to global json
                        categoryItemObject[categoryItemName] = categoryItemDetails;
                        globalJSON[category].push(categoryItemObject);
                    });
                });
            });
        });

        // Runs right after the requests above were *started*, not finished.
        fs.writeFile('output.json', JSON.stringify(globalJSON, null, 4), function (err) {
            if (err) {
                console.error('Failed to write output.json:', err);
                return;
            }
            console.log('File successfully written!');
        });
        res.send(globalJSON);
    });
});//END app.get()

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
在下面回答者的帮助下,我确实解决了我的问题,这就是我想出的方案。现在,可能还有更好的方法,请随时告诉我。
基本布局
// Two-stage layout: first fetch every category page (collecting the
// categoryItem links), then fetch every categoryItem page, then finish.
Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject) => {
    request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html) => {
        if (error) {
            return reject(error);
        }
        //build an array of ALL the categoryItemLinks
        // resolve() only takes ONE value, so both results travel in one object.
        return resolve({ response, html });
    });
}))).then(function (statesArray) {
    // Returning the inner promise makes the chain wait for stage two and
    // lets its rejections reach the single .catch() below.
    return Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject) => {
        request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html) => {
            if (error) {
                return reject(error);
            }
            // Gather Data and put into dataJson
            return resolve({ response, html });
        });
    })));
}).then(function (data) {
    // Do finishing stuff
}).catch(function (error) {
    // A .catch() without a handler does not handle anything.
    console.error(error);
});
server.js
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
/**
 * GET /scrape - two-stage scrape driven by Promise.all():
 *   1. fetch every category page and collect all categoryItem page links,
 *   2. fetch every categoryItem page,
 * then run the finishing step once all requests of both stages are done.
 */
app.get('/scrape', function (req, res) {
    // Per-request state (previously implicit globals shared by all requests).
    const categoriesArr = [];
    const allCategoryItems = [];
    const dataJson = {}; // Holds all the gathered data
    const baseUrl = 'http://www.blahblah.org';

    request.get(baseUrl, function (error, response, html) {
        if (error) {
            // Without a reply the client would wait forever.
            console.error('Failed to load the category list:', error);
            res.status(500).send('Failed to load the category list');
            return;
        }

        const $ = cheerio.load(html);

        $('#categorySelector').each(function () {
            const categoryItemLink = $(this).find('a').attr('href');
            categoriesArr.push({
                "categoryItemLink": categoryItemLink
            });
        });

        Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject) => {
            request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html) => {
                if (error) {
                    return reject(error);
                }

                const $ = cheerio.load(html);

                $('#categoryItemSelector').each(function () {
                    const categoryItemPageLink = $(this).find('a').attr('href');

                    // Skips undefined, null and the empty string.
                    if (categoryItemPageLink) {
                        allCategoryItems.push({
                            "categoryItemPageLink": categoryItemPageLink
                        });
                    }
                });

                // resolve() only takes ONE value; `res` here would have been
                // the Express response object, not the fetched page.
                return resolve({ response, html });
            });
        }))).then(function (statesArray) {
            // allCategoryItems is complete at this point. Returning the inner
            // promise makes the chain wait for stage two and routes its
            // rejections to the single .catch() below.
            return Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject) => {
                request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html) => {
                    if (error) {
                        return reject(error);
                    }

                    const $ = cheerio.load(html);

                    // Gather Data and put into dataJson

                    return resolve({ response, html });
                });
            })));
        }).then(function (data) {
            // Do finishing stuff
        }).catch(function (error) {
            console.error('Scrape failed:', error);
            // The finishing step may already have replied before throwing.
            if (!res.headersSent) {
                res.status(500).send('Scrape failed');
            }
        });
    });
});//END app.get()

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
您可以使用 Promise.all(),例如:
// Starts one request per URL and settles once all of them have responded
// (or as soon as any single one fails).
Promise.all(urls.map(url => new Promise((resolve, reject) => {
    request.get(url, (err, res, html) => {
        if (err) {
            return reject(err);
        }
        // resolve() only takes ONE value, so bundle the response and the body;
        // a second argument would be dropped.
        return resolve({ res, html });
    });
}))).then(results => {
    /*success*/ // results: array of { res, html }, in the same order as urls
}).catch(err => {
    /*error*/
    console.error(err);
});
在该代码中,.then() 会在所有请求都返回响应后执行。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.