[英]How to write data in a node.js stream without duplicates?
This question is about a URL-crawler in node.js. 这个问题是关于node.js中的URL爬网程序。 On the
start_url
URL it looks for links and "pushes" them to a .json-file (output.json). 它在
start_url
URL上查找链接,并将其“推送”到.json文件(output.json)。
How can I make sure that it does not "push" or "write" domains twice to output.json (so that I do not get duplicates)? 如何确保它不会将域两次“推送”或“写入”到output.json(这样我就不会重复)? I've been using the hash function but this has caused problems.
我一直在使用哈希函数,但这已引起问题。
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://blog.codinghorror.com/"]
var wstream = fs.createWriteStream("output.json");
// Extract root domain name from string
function extractDomain(url) {
    // With a protocol prefix (http://, https://, ftp://, ...) the host is the
    // third '/'-separated segment; without one it is the first.
    var segments = url.split('/');
    var host = (url.indexOf("://") > -1) ? segments[2] : segments[0];
    // Trim a trailing :port from the host before returning it.
    return host.split(':')[0];
}
// Crawl `url`: extract the domain of every link on the page, append each one
// to the crawl queue (start_url) and to the output stream, then recurse on
// the next queued URL until the queue is empty.
// NOTE(review): domains are written without any deduplication, so the same
// domain can appear many times in output.json — this is the problem the
// question is asking about.
var req = function(url){
    request(url, function(error, response, html){
        if(!error){
            var $ = cheerio.load(html);
            $("a").each(function() {
                var link = $(this).attr("href");
                // Anchors without an href attribute yield undefined here;
                // extractDomain would then throw on url.indexOf, so skip them.
                if (link) {
                    var makelinkplain = extractDomain(link);
                    start_url.push("http://" + makelinkplain);
                    wstream.write('"http://'+ makelinkplain + '",');
                }
            });
        }
        start_url.shift(); // done with the current URL
        if(start_url.length > 0) {
            return req(start_url[0]); // crawl the next queued URL
        }
        wstream.end(); // queue drained: close output.json
    });
}
req(start_url[0]);
You can just keep track of the previously seen domains in a Set
object like this: 您可以像这样在
Set
对象中跟踪以前看到的域:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var domainList = new Set();
var start_url = ["http://blog.codinghorror.com/"]
var wstream = fs.createWriteStream("output.json");
// Extract root domain name from string
function extractDomain(url) {
    // With a protocol prefix (http://, https://, ftp://, ...) the host is the
    // third '/'-separated segment; without one it is the first.
    var segments = url.split('/');
    var host = (url.indexOf("://") > -1) ? segments[2] : segments[0];
    // Trim a trailing :port, then lowercase: domain names are
    // case-insensitive, but Set membership checks are not.
    return host.split(':')[0].toLowerCase();
}
// Crawl `url`: record each newly seen domain in domainList, write it to the
// output stream, queue it for crawling, then recurse until the queue is empty.
var req = function(url){
    // Seed the Set with the domain currently being crawled so pages linking
    // back to it (including the start URL itself) are neither written to
    // output.json again nor re-queued for a redundant crawl.
    domainList.add(extractDomain(url));
    request(url, function(error, response, html){
        if(!error){
            var $ = cheerio.load(html);
            $("a").each(function() {
                var link = $(this).attr("href");
                // Anchors without an href yield undefined — skip them.
                if (link) {
                    var makelinkplain = extractDomain(link);
                    // Only emit and queue domains we have not seen before.
                    if (!domainList.has(makelinkplain)) {
                        domainList.add(makelinkplain);
                        start_url.push("http://" + makelinkplain);
                        wstream.write('"http://'+ makelinkplain + '",');
                    }
                }
            });
        }
        start_url.shift(); // finished with the current URL
        if(start_url.length > 0) {
            return req(start_url[0]); // crawl the next queued URL
        }
        wstream.end(); // queue drained: close output.json
    });
}
req(start_url[0]);
Note: I also added a .toLowerCase()
to the extractDomain()
function since domains are not case sensitive, but a Set object is. 注意:我还向
extractDomain()
函数添加了.toLowerCase()
,因为域不区分大小写,而Set对象则区分大小写。 This will make sure that even domains that differ only in case are recognized as the same domain. 这样可以确保即使只有大小写不同的域也可以识别为同一域。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.