简体   繁体   中英

Node.js web crawler : Maximum call stack exceeded

I'm new to node.js and I'm trying to figure out why I'm getting a maximum call stack exceeded error.

It's a web crawler that has 3 pieces.

  1. standard event emitter
  2. function - doRequest(url)
  3. the manager that takes the results of the requests (emitted as events) and updates the database accordingly.

The manager will call doRequest up to a set number of times simultaneously, since they are all asynchronous calls. When a request finishes (signalled by emitting an event), the manager logs the result, grabs the next URL from the DB of scraped URLs, and then emits it as available. That emit causes the manager to once again call the doRequest function.

I'm really, really lost as to why this is giving me a maximum call stack exceeded issue. I've also put in those console.trace() calls, and none of the traced call stacks is that large.

// Module-wide event bus shared by doRequest and the manager.
var eventsModule = require('events');
var emitter = new eventsModule.EventEmitter();
// Counter bumped by getUrl and decremented by addToQueue; also passed as an
// extra argument to one setImmediate call inside doRequest.
var inqueue = 0;
/**
 * Fetches `url` and reports the outcome on the shared `emitter`.
 *
 * On a response without an error and with status 200, the body is parsed with
 * cheerio; every anchor whose href contains "http://" is announced with a
 * "gotUrl" event, followed by a single "scraped" event for `url`.
 * Any other outcome produces a "dead" event for `url`.
 *
 * All emits are deferred through setImmediate.
 *
 * @param {string} url - Address of the page to fetch.
 */
function doRequest(url) {
    console.trace();

    // Announces each anchor of the parsed page whose href contains "http://".
    // Hrefs without it are skipped: they are buttons or relative links.
    function announceLinks(html) {
        var $ = cheerio.load(html);
        $('a').each(function () {
            var href = $(this).attr('href');
            if (!href) {
                // Anchor that is not actually a link.
                return;
            }
            if (href.indexOf("http://") === -1) {
                return;
            }
            // setImmediate takes no delay: `inqueue` is simply forwarded to
            // the callback as an argument, which ignores it.
            setImmediate(function () {
                emitter.emit("gotUrl", href);
            }, inqueue);
        });
    }

    function onResponse(error, response, html) {
        if (error || response.statusCode != 200) {
            setImmediate(function () {
                emitter.emit("dead", url);
            });
            return true;
        }

        announceLinks(html);
        setImmediate(function () {
            emitter.emit("scraped", url);
        });
        return true;
    }

    // The trailing 500 is not a delay either; setImmediate passes it to the
    // callback as an (unused) argument.
    setImmediate(function () {
        request(url, onResponse);
    }, 500);
}



module.exports = function() {
    console.log("built");
    Error.stackTraceLimit= Infinity;
    var maxthreads = 1;
    var running = 0;
    var urls = [];
    var doneurls = [];
    var excluderegex = / /;
    var index = 0;
    var ORM = null;
    var total = 0;

    var storage = null;
    var visited = null;
    var queuelinks = null;

    var cleanupThreshold = 5;
    var cleanupMin  = 3
    var timeout = 0; //set to zero for initial pull, then moved to 500 in doRequest.
    var theManager = this;
    this.logTheFallen = function(url) {
        storage.create({url:url}).success(function(){
            console.log("found dead link:"+url);
            setImmediate(function(){emitter.emit("available")});
        });
    };

    this.setUrls = function(arr) {   
        console.log(arr);
       queuelinks.create({url:arr[0]}).success(function(q){
            console.log("Queued again: "+q.url);
            setImmediate(theManager.run);           
        });

    };

    this.setExcluders = function(exclude) {
        excluderegex = exclude;
    }

    this.setOrm = function(zOrm) {
        ORM = zOrm;
    }

    this.setStorage =  function(model) {
        storage =  model;
    }
    this.setVisited = function(model) {
        visited = model;
    }
    this.setQueue = function(model) {
        queuelinks =  model;
    }

    this.setCleanupThreshold = function(thres){ cleanupThreshold = thres};



    this.threadfinished = function(){

        queuelinks.count().success(function(count){
            console.log("There are "+count+" that have been added to the queue.  On index:"+index);
            //DO NOT mess with this synchronous emit.  It makes things fail for some reason.
        });
        emitter.emit("available");

        while(running<maxthreads) {
            running++;
            setImmediate(function(){emitter.emit("available")});
        }
    };

    this.getUrl =function() {
        inqueue++;
        console.trace()
        console.log("getting url");
        index++;
        queuelinks.find({where:{id:index}}).success(function(item){
            console.log("success");
            console.trace();
            if(item){
                console.log("found item");
                setImmediate(function(){emitter.emit("newurl", item.url)});
            } else {
                index--;
                console.log("expended");
                setImmediate(function(){emitter.emit("expended")});
            }
        }).error(function(err){
            console.log(err);
        });
    };

    this.addToQueue =function(zurl) {
        console.log("queueing link"+zurl);
        queuelinks.findOrCreate({url:zurl}).success(function(item){
            console.trace();
            inqueue--;
        });
    };

    this.logUrl = function(href) {
        //console.log(href);
    };

    this.newThread = function() {
        console.log("Launching a new request");
        console.trace();
        running++;
        setImmediate(function(){theManager.getUrl()});

    };

    this.run = function() {
        console.log("running");
        var ind = 1;
        function launchThread(ind){

            queuelinks.find({where:{id:ind}}).success(function(queued){
                if(queued) {
                ind++;
                console.log("launching thread...");
                newThread();
                launchThread(ind);
                }
            });
            console.log(ind);
        };
        setImmediate(function(){launchThread(ind)});
    }

    emitter.on("dead", function(url){
        setImmediate(function(){theManager.logTheFallen(url)});
    });


    emitter.on("newurl", function(url){
        console.log("got new url:"+url);

       setImmediate(function(){doRequest(url)});
    });

    emitter.on("gotUrl", function(url){
        setImmediate(function(){theManager.addToQueue(url)});
    });

    emitter.on("scraped", function(url){
        var zUrl = new String(url);
        setImmediate(function(){
            console.trace();
            visited.findOrCreate({url:url}).success(function(visited) {
                if(visited.created){
                    console.log("added visited link");
                } else{ 
                    console.log("already added");
                    setImmediate(theManager.threadfinished)
                }
            });
        },0);
    });

    emitter.on("available", function(){
        console.log("launching thread");
        console.trace();
        theManager.newThread();
    });
    emitter.on("expended", function() {
        console.log("expended");
    });
    return this;
}

Naturally, as soon as I post to SO I figure it out. Here's the deal for those who come after as far as I can tell.

Any sort of pub/sub behavior (like with events) can be prone to this if the subscribers push stuff back to the publishers, because that eventually creates a chain. EventEmitter.emit() calls its listeners synchronously, so emitted events and their handlers still share one call stack sequence - they aren't truly async like I thought they were.

SO, the simple answer is this:

emitter.on("yourEvent", function(){
   // Pass the function itself. Writing `yourFunction()` would run it right
   // here, inside the emit's call stack, and hand only its return value to
   // setTimeout, so nothing would actually be deferred.
   setTimeout(yourFunction, 0);
});

This will clear the callstack and create a new one. TADA!

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM