[英]How to write data in a node.js stream without duplicates?
This question is about a URL-crawler in node.js. 这个问题是关于node.js中的URL爬网程序。 On the
start_url
URL it looks for links and "pushes" them to a .json-file (output.json). 它在
start_url
URL上查找链接,并将其“推送”到.json文件(output.json)。
How can I make sure that it does not "push" or "write" domains twice to output.json (so that I do not get duplicates)? 如何确保它不会将域两次“推送”或“写入”到output.json(这样我就不会重复)? I've been using the hash function but this has caused problems.
我一直在使用哈希函数,但这已引起问题。
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://blog.codinghorror.com/"]
var wstream = fs.createWriteStream("output.json");
// Extract root domain name from string
function extractDomain(url) {
    // With a protocol prefix (http://, https://, ftp://, ...) the host is the
    // third '/'-separated segment; without one it is the first.
    var segments = url.split('/');
    var host = (url.indexOf("://") > -1) ? segments[2] : segments[0];
    // Trim a trailing :port from the host before returning it.
    return host.split(':')[0];
}
// Crawl `url`: extract the domain of every link on the page, append each one
// to the crawl queue (start_url) and to the output stream, then recurse on
// the next queued URL until the queue is empty.
// NOTE(review): domains are written without any deduplication, so the same
// domain can appear many times in output.json — this is the problem the
// question is asking about.
var req = function(url){
    request(url, function(error, response, html){
        if(!error){
            var $ = cheerio.load(html);
            $("a").each(function() {
                var link = $(this).attr("href");
                // Anchors without an href attribute yield undefined here;
                // extractDomain would then throw on url.indexOf, so skip them.
                if (link) {
                    var makelinkplain = extractDomain(link);
                    start_url.push("http://" + makelinkplain);
                    wstream.write('"http://'+ makelinkplain + '",');
                }
            });
        }
        start_url.shift(); // done with the current URL
        if(start_url.length > 0) {
            return req(start_url[0]); // crawl the next queued URL
        }
        wstream.end(); // queue drained: close output.json
    });
}
req(start_url[0]);
You can just keep track of the previously seen domains in a Set
object like this: 您可以像这样在
Set
对象中跟踪以前看到的域:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var domainList = new Set();
var start_url = ["http://blog.codinghorror.com/"]
var wstream = fs.createWriteStream("output.json");
// Extract root domain name from string
function extractDomain(url) {
    // With a protocol prefix (http://, https://, ftp://, ...) the host is the
    // third '/'-separated segment; without one it is the first.
    var segments = url.split('/');
    var host = (url.indexOf("://") > -1) ? segments[2] : segments[0];
    // Trim a trailing :port, then lowercase: domain names are
    // case-insensitive, but Set membership checks are not.
    return host.split(':')[0].toLowerCase();
}
// Crawl `url`: record each newly seen domain in domainList, write it to the
// output stream, queue it for crawling, then recurse until the queue is empty.
var req = function(url){
    // Seed the Set with the domain currently being crawled so pages linking
    // back to it (including the start URL itself) are neither written to
    // output.json again nor re-queued for a redundant crawl.
    domainList.add(extractDomain(url));
    request(url, function(error, response, html){
        if(!error){
            var $ = cheerio.load(html);
            $("a").each(function() {
                var link = $(this).attr("href");
                // Anchors without an href yield undefined — skip them.
                if (link) {
                    var makelinkplain = extractDomain(link);
                    // Only emit and queue domains we have not seen before.
                    if (!domainList.has(makelinkplain)) {
                        domainList.add(makelinkplain);
                        start_url.push("http://" + makelinkplain);
                        wstream.write('"http://'+ makelinkplain + '",');
                    }
                }
            });
        }
        start_url.shift(); // finished with the current URL
        if(start_url.length > 0) {
            return req(start_url[0]); // crawl the next queued URL
        }
        wstream.end(); // queue drained: close output.json
    });
}
req(start_url[0]);
Note: I also added a .toLowerCase()
to the extractDomain()
function since domains are not case sensitive, but a Set object is. 注意:我还向
extractDomain()
函数添加了.toLowerCase()
,因为域不区分大小写,而Set对象则区分大小写。 This will make sure that even domains that differ only in case are recognized as the same domain. 这样可以确保即使只有大小写不同的域也可以识别为同一域。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.