繁体   English   中英

NodeJS 多个请求

[英]NodeJS multiple requests

我正在编写一个网络爬虫,它根据如下所示的列表发出多个请求

 1. Category1
    1a. categoryItem1
    1b. categoryItem2
 2. Category2
    2a. categoryItem1
    2b. categoryItem2
    2c. categoryItem3
 3. Category3
    3a. categoryItem1

Category 和 categoryItem 都是链接。一次只能展开 1 个 Category。Categories 和 categoryItems 的数量可能会发生变化,所以我事先不知道确切的数量。

我正在收集每个 categoryItem 页面上的数据,并保存到如下所示的 JSON 中:

{
    "Category1": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        }
    ],
    "Category2": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        },
        {
            "categoryItem3": {
                // Details saved here
            }
        }
    ],
    "Category3": [
        {
            "categoryItem1": {
                // Details saved here
            }
        }
    ]
}

我唯一剩下的就是弄清楚如何使这个行为同步

  1. 获取打开页面
  2. 打开每个Category列表
  3. 打开每个categoryItem详情页面

如果您想知道,这是我所参考的网络爬虫教程。由于调用是异步的,我无法知道最后一个页面何时解析完成,下面是脚本的结构:

server.js

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

// GET /scrape: loads the category list page, follows each category's
// expansion link, then follows each categoryItem link, collecting the results
// in globalJSON. globalJSON is written to output.json and sent as the response.
//
// NOTE(review): all three request.get() calls hand back their result through a
// callback. The fs.writeFile()/res.send() statements near the bottom run right
// after the $('#categoryListSelector').filter() call returns; nothing makes
// them wait for the nested request callbacks that push the item details. This
// is the ordering problem the question describes.
app.get('/scrape', function (req, res) {

    // NOTE(review): assigned without var/let/const, so these two names are not
    // local to this handler.
    globalJSON = {};

    baseUrl = 'http://...';

    // 1.) open page with list
    request.get(baseUrl, function (error, response, html) {
        // NOTE(review): there is no else branch; on error nothing is sent to the client.
        if (!error) {

            var $ = cheerio.load(html);

            // select the list
            // The callback returns nothing; .filter() is used here only to run
            // the function for the matched element(s).
            $('#categoryListSelector').filter(function () {
                var data = $(this);

                var listItem = data.find('#listItemSelector');

                var expansionLink = listItem.find('a').attr('href'); // <a href=""></a>
                var category = listItem.find('font').text();

                // Save category to global json
                globalJSON[category] = [];

                // 2.) Expand the list by opening expansionLink
                request.get(baseUrl + expansionLink, function (error, response, html) {
                    if (!error) {
                        // Shadows the outer $ with the parsed expansion page
                        var $ = cheerio.load(html);

                        // Select the sub items of each list item
                        $('#subItem selector').filter(function () {
                            var data = $(this);

                            var categoryItemPageLinkElement = data.find('a');

                            var categoryItemName = categoryItemPageLinkElement.text();
                            var categoryItemLink = $(categoryItemPageLinkElement).attr('href');

                            // NOTE(review): "undefinded" is a typo; typeof never yields that
                            // string, so the first comparison is always true. The loose
                            // `!= null` check is what actually rejects an undefined link.
                            if (typeof categoryItemLink != "undefinded" && categoryItemLink != null && categoryItemLink != "") {

                                // NOTE(review): both names are assigned without a declaration,
                                // and the request callback below reads them later. Presumably
                                // every pending callback therefore sees whatever was assigned
                                // last; confirm whether one object per item was intended.
                                categoryItemObject = {}; // { categoryItemName: categoryItemDetails }
                                categoryItemDetails = {};

                                // 3.) Open the categoryItem page to start gathering data
                                request.get(baseUrl + categoryItemLink, function (error, response, html) {
                                    if (!error) {
                                        var $ = cheerio.load(html);

                                        // GATHER and save data here

                                        // Done gathering data save to global json
                                        categoryItemObject[categoryItemName] = categoryItemDetails;
                                        globalJSON[category].push(categoryItemObject);

                                    }
                                });
                            }
                        });
                    }
                });
            });

            // Runs as soon as the .filter() call above has returned (see the
            // note at the top of the handler).
            // NOTE(review): the success message is logged even when err is set.
            fs.writeFile('output.json', JSON.stringify(globalJSON, null, 4), function (err) {
                console.log('File successfully written!');
            });
            res.send(globalJSON);

        }//END if(!error)
    });

})//END app.get()

// Start listening (the port is passed as the string '8081') and export the app.
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;

更新

在下面回答者的帮助下,我解决了我的问题,这是我最终的方案。当然,可能还有更好的方法,欢迎随时告诉我。

基本布局

// Basic layout: request every category page in parallel, then every
// categoryItem page in parallel, and only then run the finishing step.
// The chain is flat and each step returns its Promise.all(), so the single
// .catch() at the end receives a failure from either stage.
Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject) => {
    request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html) => {
        if (error) {
            return reject(error);
        }

        //build an array of ALL the categoryItemLinks

        // resolve() forwards only its first argument, so pass the HTML alone
        return resolve(html);
    });
}))).then(function (categoryPages) {

    // Returning the inner Promise.all() makes the next .then() wait for it
    return Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject) => {
        request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html) => {
            if (error) {
                return reject(error);
            }
            // Gather Data and put into dataJson

            return resolve(html);
        });
    })));

}).then(function (categoryItemPages) {

    // Do finishing stuff

}).catch(function (error) {

    // A failed request from either stage ends up here
    console.error(error);

});

server.js

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

// GET /scrape: scrapes in three stages and answers once all of them are done.
//   1. Load the start page and collect the link of every category.
//   2. Load all category pages in parallel and collect every categoryItem link.
//   3. Load all categoryItem pages in parallel and gather their data.
// On success the gathered dataJson is sent to the client; if any request
// fails, the error is logged and a 500 response is sent instead.
app.get('/scrape', function (req, res) {

    // Declared locally so that concurrent /scrape requests do not share state
    const categoriesArr = [];
    const allCategoryItems = [];

    const dataJson = {}; // Holds all the data gathered for this request

    const baseUrl = 'http://www.blahblah.org';

    request.get(baseUrl, function (error, response, html) {
        if (error) {
            // Without a reply the client would wait for a response indefinitely
            console.error(error);
            return res.status(500).send('Scrape failed');
        }

        const $ = cheerio.load(html);

        // .each() is the iteration helper; .filter() is meant for narrowing a selection
        $('#categorySelector').each(function () {
            const categoryItemLink = $(this).find('a').attr('href');

            categoriesArr.push({
                "categoryItemLink": categoryItemLink
            });
        });

        Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject) => {
            request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html) => {
                if (error) {
                    return reject(error);
                }

                const $ = cheerio.load(html);

                $('#categoryItemSelector').each(function () {
                    const categoryItemPageLink = $(this).find('a').attr('href');

                    // `!= null` rejects both null and undefined
                    if (categoryItemPageLink != null && categoryItemPageLink !== "") {
                        allCategoryItems.push({
                            "categoryItemPageLink": categoryItemPageLink
                        });
                    }
                });

                // resolve() forwards only its first argument, so pass the HTML alone
                return resolve(html);
            });
        }))).then(function (categoryPages) {

            // allCategoryItems is complete here. Returning the inner
            // Promise.all() makes the next .then() wait for stage 3.
            return Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject) => {
                request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html) => {
                    if (error) {
                        return reject(error);
                    }
                    const $ = cheerio.load(html);
                    // Gather Data and put into dataJson

                    return resolve(html);
                });
            })));

        }).then(function (categoryItemPages) {

            // Do finishing stuff
            res.send(dataJson);

        }).catch(function (error) {

            // A failed request from stage 2 or stage 3 ends up here
            console.error(error);
            res.status(500).send('Scrape failed');

        });
    });

});//END app.get()

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;

您可以使用Promise.all() ,例如:

// Issue one request per URL in parallel. The combined promise fulfils with an
// array of { res, html } objects, in the same order as `urls`, once every
// request has answered. It rejects with the first request error.
Promise.all(urls.map(url => new Promise((resolve, reject) => {
    request.get(url, (err, res, html) => {
        if (err) {
            return reject(err);
        }
        // resolve() forwards only its first argument, so bundle both values
        return resolve({ res, html });
    });
}))).then(/*success*/).catch(/*error*/);

在该代码中, .then()在所有请求都返回响应后执行。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM