繁体   English   中英

使用 Node.js 网页爬虫遍历 DOM 元素时遇到问题

[英]Having trouble looping through DOM elements with node web scraper

我已经能让爬虫抓取到我想要的内容,但在让它遍历所有我想抓取的页面时遇到了很多问题。 我认为问题可能出在 for 循环的位置及其执行方式上。

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
//Shared scrape state: `author` and `quote` hold the text of the element currently being read,
//`json` collects every author and quote found and is what gets written to output.json
var author, quote;
var json = { author : [], quote : []};

//Initialize the scraper the scraper url in the DOM
app.get('/scrape', function(req, res){
    //set the scraper url

问题就出在这里:我该如何设置,才能让它遍历循环中的所有页面(第 1 到 100 页),而不是只抓取最后一页?

    //NOTE: url is reassigned on every pass, so once the loop ends it only holds the page 100 address
    for(var i = 1; i < 101; i++){
          url = 'http://www.goodreads.com/quotes?page=' + i;
    }

//

    //Fetch the page at url; the callback scrapes it, cleans the quotes, writes output.json
    //and answers the HTTP request
    request(url, function(error, response, html){
        if(!error){
            //parse the returned HTML with cheerio so it can be queried with jQuery-style selectors
            var $ = cheerio.load(html);

            //store the text of every link that is a direct child of a .quoteText element as an author
            $('.quoteText > a').filter(function(){
                var data = $(this);
                author = data.text();

                json.author.push(author);
                // all.push(data.text());
            })
            //store the full text of every .quoteText element as a raw quote
            $('.quoteText').filter(function(){
                var data = $(this);
                quote = data.text();

                json.quote.push(quote);
            })
        }
        //clean up every collected quote string
        //(this part also runs when the request failed and nothing new was collected)
        for(var i = 0; i < json.quote.length; i++) {
            //find the "―" character, which the code treats as the end of the quote text
            //NOTE: endQuote is never declared, so this assignment creates an implicit global
            endQuote = json.quote[i].indexOf("―")
            //keep only the text before the dash, also dropping the one character right in front of it
            //NOTE: if no "―" is present indexOf returns -1 and this yields an empty string
            json.quote[i] = json.quote[i].substring(0, endQuote - 1);
            //remove line breaks (\r\n, \n, \r) from the string
            json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
        }
        //write the collected data to output.json as JSON indented by 4 spaces
        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            //NOTE: err is not checked, so this message is logged even if the write failed
            console.log('File successfully written! - Check your project directory for the output.json file');
        })

        //reply to the HTTP request without waiting for the file write to finish
        res.send('Check your console!')
    })
})

//Start the HTTP server on port 8081 and expose the Express app as this module's export
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;

****编辑****

已修改代码,改为在函数调用结束时执行 res.send('Check your console!'),因为如果多次调用 res.send,应用会抛出错误。 代码中还包含了根据被采纳的答案所做的修改。

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
//Shared scrape state: `author` and `quote` hold the text of the element currently being read,
//`json` collects every author and quote found, and `url` lists the page addresses to request
var author, quote;
var json = { author : [], quote : []};
var url = []

//Initialize the scraper the scraper url in the DOM
//Scrape route: requests every quotes page, collects authors and quotes into `json`,
//then runs cleanUp() and answers the HTTP request once every page request has finished
app.get('/scrape', function(req, res){
    //start each run from empty lists: `url` and `json` live at module level, so without
    //this a second call to /scrape would request and store every page twice
    //NOTE: overlapping /scrape calls still share these lists
    url.length = 0;
    json.author.length = 0;
    json.quote.length = 0;

    //set the scraper urls (pages 1 through 100)
    for(var i = 1; i < 101; i++){
        url.push('http://www.goodreads.com/quotes?page=' + i);
    }

    //number of page requests that have not called back yet
    var pending = url.length;

    //called exactly once per page request, whether it succeeded or failed
    function pageDone(){
        pending--;
        if(pending > 0){
            return;
        }
        //every page has answered: clean the quotes, write output.json and reply a single
        //time (sending the response more than once would throw)
        cleanUp();
        res.send('Check your console!');
    }

    //forEach instead of for...in: for...in walks property keys, not array elements
    url.forEach(function(pageUrl){
        request(pageUrl, function(error, response, html){
            if(error){
                //report the failed page instead of dropping it silently, then carry on
                console.error('Request failed for ' + pageUrl + ':', error);
                pageDone();
                return;
            }

            //use cheerio to select DOM elements with jquery-style selectors
            var $ = cheerio.load(html);

            //each() rather than filter(): the callbacks are run only for their side effects
            $('.quoteText > a').each(function(){
                author = $(this).text();
                json.author.push(author);
            });
            $('.quoteText').each(function(){
                quote = $(this).text();
                json.quote.push(quote);
            });

            pageDone();
        });
    });
})

//Cleans every scraped string in json.quote in place, then writes json to output.json.
//Each string is cut just before the "―" that introduces the author and has its line
//breaks removed. A string without a "―" is kept in full, so running cleanUp() again on
//already cleaned data does not wipe it.
function cleanUp(){
    for(var i = 0; i < json.quote.length; i++) {
        var raw = json.quote[i];
        //find the index of where the quote ends
        //(declared locally: assigning to an undeclared name throws in strict mode)
        var endQuote = raw.indexOf("―");
        //keep only the quote, dropping the single character right before the dash;
        //indexOf returns -1 when there is no dash, in which case the text is left whole
        var text = endQuote === -1 ? raw : raw.substring(0, endQuote - 1);
        //remove line breaks from the string
        json.quote[i] = text.replace(/(\r\n|\n|\r)/gm,"");
    }
    //write the json file to the current working directory
    fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
        if(err){
            //do not claim success when the write failed
            console.error('Could not write output.json:', err);
            return;
        }
        console.log('File successfully written! - Check your project directory for the output.json file');
    })
}


//Start the HTTP server on port 8081 and expose the Express app as this module's export
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;

在您提供的示例代码中:

//url is assigned, not appended to: each pass replaces the previous page address
for(var i = 1; i < 101; i++){
      url = 'http://www.goodreads.com/quotes?page=' + i;
}

每次循环时,for循环都会覆盖url变量。

只需对代码做一些小改动就能让它正常工作; 最简单的方法是把 url 改成数组,然后在每次循环时把地址 push 进该数组,这样 url 列表就会不断累积,如下面的代码所示:

//Collect one address per quotes page (pages 1 through 100) instead of overwriting a single value
var url = [];
var i = 1;
while (i < 101) {
    url.push('http://www.goodreads.com/quotes?page=' + i);
    i += 1;
}

然后,您需要对数组中的每一项调用 request 函数,因为 url 现在是一个包含 100 个元素的数组;同时还要把 fs.writeFile 调用改为 fs.appendFile,这样每次 request 调用的结果都会追加到 output.json 文件中,而不是覆盖它。

最后,您还应该考虑限制请求,以免影响正在抓取的站点的服务器。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM