![](/img/trans.png)
[英]Looping through DOM elements and children to create a JS Array - having trouble understanding the best way to accomplish this
[英]Having trouble looping through DOM elements with node web scraper
我已经能让爬虫大致完成我想做的事情，但在让它真正遍历我想抓取的所有页面时遇到了很多问题。我认为问题可能与我的 for 循环的位置及其执行方式有关。
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : []};
//Initialize the scraper the scraper url in the DOM
app.get('/scrape', function(req, res){
//set the scraper url
问题就出在这里：我该如何设置，才能让它遍历所有101页，而不是只设置并请求最后一页？
//NOTE: url is reassigned on every pass, so once the loop ends it only holds the last value (page=100)
for(var i = 1; i < 101; i++){
url = 'http://www.goodreads.com/quotes?page=' + i;
}
//
//fetch the page at `url`; the callback receives the error (if any), the response and the HTML body
request(url, function(error, response, html){
if(!error){
//parse the HTML with cheerio to get a jQuery-style selector function
var $ = cheerio.load(html);
//push the text of every <a> that is a direct child of a .quoteText element onto json.author
$('.quoteText > a').filter(function(){
var data = $(this);
author = data.text();
json.author.push(author);
// all.push(data.text());
})
//push the full text of every .quoteText element onto json.quote
$('.quoteText').filter(function(){
var data = $(this);
quote = data.text();
json.quote.push(quote);
})
}
//NOTE: everything below sits outside the if(!error) block, so it also runs when the request failed
//loop through the collected quotes to clean up the strings
for(var i = 0; i < json.quote.length; i++) {
//find the index of the "―" character, which is taken to mark where the quote ends
endQuote = json.quote[i].indexOf("―")
//keep only the text before that index, also dropping the one character just before it
json.quote[i] = json.quote[i].substring(0, endQuote - 1);
//remove line breaks (\r\n, \n, \r) from the string
json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
}
//write the json object to output.json, pretty-printed with 4-space indentation
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
//NOTE: err is not checked, so this message is logged even if the write failed
console.log('File successfully written! - Check your project directory for the output.json file');
})
res.send('Check your console!')
})
})
// Start the HTTP server on port 8081 and announce it on the console.
app.listen('8081');
console.log('Magic happens on port 8081');
// Expose the Express app to any module that requires this file.
module.exports = app;
exports = module.exports;
****编辑****
更改了代码，使 res.send('Check your console!') 在函数调用结束时执行；如果 res.send 被多次调用，应用将抛出错误。
此外还包含了根据已接受的答案所做的更改。
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : []};
var url = []
//Initialize the scraper the scraper url in the DOM
/**
 * GET /scrape: requests Goodreads quote pages 1 through 100, collects the
 * author and quote text of each page into the module-level `json` object and,
 * once every request has finished, calls cleanUp() so the results are tidied
 * and written to output.json.
 *
 * The HTTP response is sent immediately and exactly once; the scraping itself
 * continues in the background.
 *
 * @param {object} req - Express request (unused).
 * @param {object} res - Express response.
 */
app.get('/scrape', function(req, res){
  // `url` and `json` live at module level. Reset them so that a second hit on
  // this route does not append another 100 URLs or re-clean old quotes.
  // NOTE(review): overlapping scrapes still share this state and are not supported.
  url.length = 0;
  json.author.length = 0;
  json.quote.length = 0;

  //set the scraper urls
  for(var page = 1; page < 101; page++){
    url.push('http://www.goodreads.com/quotes?page=' + page);
  }

  // Requests complete asynchronously and in any order; count them down so
  // cleanUp() runs exactly once, after the last callback has fired.
  var pending = url.length;

  url.forEach(function(pageUrl){
    request(pageUrl, function(error, response, html){
      if(error){
        // Report the failure instead of silently dropping the page.
        console.error('Request failed for ' + pageUrl, error);
      } else {
        //use cheerio to use jquery to select DOM elements
        var $ = cheerio.load(html);
        //.each() is the iteration method; .filter() is meant for narrowing a selection
        $('.quoteText > a').each(function(){
          json.author.push($(this).text());
        });
        $('.quoteText').each(function(){
          json.quote.push($(this).text());
        });
      }

      pending -= 1;
      if(pending === 0){
        cleanUp();
      }
    });
  });

  res.send('Check your console!')
})
/**
 * Tidies every entry of the module-level `json.quote` array in place and then
 * writes the whole `json` object to output.json.
 *
 * For each quote: if it contains the "―" character, only the text before it
 * is kept (minus the single character immediately preceding it); line breaks
 * are then removed. Quotes without "―" keep their text instead of being
 * blanked out.
 */
function cleanUp(){
  //loop through json object to clean up strings
  for(var i = 0; i < json.quote.length; i++) {
    var text = json.quote[i];
    //find the index of where the quote ends
    var endQuote = text.indexOf("―");
    //indexOf returns -1 when "―" is missing; substring(0, -2) would then yield ""
    if(endQuote !== -1) {
      //select only the part of the string that contains a quote
      text = text.substring(0, endQuote - 1);
    }
    //remove line breaks from string
    json.quote[i] = text.replace(/(\r\n|\n|\r)/gm,"");
  }
  //write the json file to folder
  fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
    if(err) {
      //do not claim success when the write failed
      console.error('Failed to write output.json', err);
      return;
    }
    console.log('File successfully written! - Check your project directory for the output.json file');
  })
}
// Start the HTTP server on port 8081 and announce it on the console.
app.listen('8081');
console.log('Magic happens on port 8081');
// Expose the Express app to any module that requires this file.
module.exports = app;
exports = module.exports;
在您提供的示例代码中:
//each pass assigns a new string to url, replacing the value from the previous pass
for(var i = 1; i < 101; i++){
url = 'http://www.goodreads.com/quotes?page=' + i;
}
每次循环时,for循环都会覆盖url变量。
您可以对代码进行一些小的更改即可使其工作; 最简单的方法是将url设置为数组,然后每次通过循环将其压入该数组,以便url列表像下面的代码一样继续积累:
// Collect one URL per page (pages 1 through 100) in an array.
var url = [];
var baseUrl = 'http://www.goodreads.com/quotes?page=';
for (var page = 1; page <= 100; page += 1) {
  url.push(baseUrl + page);
}
然后，您需要为数组中的每一项调用 request 函数，因为 url 现在是一个包含100个项目的数组；同时还要将 fs.writeFile 调用更改为 fs.appendFile，这样每次 request 调用的结果都会追加到 output.json 文件中，而不是覆盖它。
最后,您还应该考虑限制请求,以免影响正在抓取的站点的服务器。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.