
How can I get this Node.js function to return a value?

The following code is a modification of the soupselect demo example. It fetches some HTML, prints a list of links, and stores them in a variable:

crawl = function(host) {
    var select = require('soupselect').select,
        htmlparser = require("htmlparser"),
        http = require('http'),
        sys = require('sys');

    // fetch some HTML...
    var http = require('http');
    var client = http.createClient(80, host);
    var request = client.request('GET', '/',{'host': host});

    var newPages = []

    request.on('response', function (response) {
        response.setEncoding('utf8');

        var body = "";
        response.on('data', function (chunk) {
            body = body + chunk;
        });

        response.on('end', function() {

            // now we have the whole body, parse it and select the nodes we want...
            var handler = new htmlparser.DefaultHandler(function(err, dom) {
                if (err) {
                    sys.debug("Error: " + err);
                } else {

                    // soupselect happening here...
                    var titles = select(dom, 'a.title');

                    sys.puts("Top stories from reddit");
                    titles.forEach(function(title) {
                        sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
                        newPages.push(title.attribs.href);
                    })
                }
            });

            var parser = new htmlparser.Parser(handler);
            parser.parseComplete(body);
        });
    });
    request.end();
}

What I really want is for this function to return newPages; I want to be able to say newPages = crawl(host). The trouble is, I'm not sure whether that makes sense or where to put the return statement. I can see that newPages exists before the request has ended, but it is empty after the request is over.
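To illustrate the timing (a minimal sketch, with setTimeout standing in for the HTTP request): the function returns immediately, before the callback that fills the array has had a chance to run.

var crawlSketch = function() {
    var newPages = [];
    setTimeout(function() {
        newPages.push('http://example.com/'); // happens later, after crawlSketch has returned
    }, 100);
    return newPages; // runs now, while newPages is still empty
};

console.log(crawlSketch()); // prints [] -- the push has not happened yet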

How do I make this function return newPages?

Felix is right, you can't. This is the closest you can get:

Change your function signature to

crawl = function(host, done)

and update your function body to this:

titles.forEach(function(title) {
    sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
    newPages.push(title.attribs.href);
});
done(newPages); // call the callback once, after all the titles have been collected

then you can call crawl like this:

var processNewPages = function(pages) {
    // do something with pages here
    ...
};

crawl(host, processNewPages);
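For completeness, here is a sketch of the whole function with the callback wired in (same htmlparser/soupselect/http.createClient setup as the question; done is called exactly once, after the 'end' event, rather than inside the loop):

crawl = function(host, done) {
    var select = require('soupselect').select,
        htmlparser = require("htmlparser"),
        http = require('http'),
        sys = require('sys');

    var client = http.createClient(80, host);
    var request = client.request('GET', '/', {'host': host});
    var newPages = [];

    request.on('response', function(response) {
        response.setEncoding('utf8');

        var body = "";
        response.on('data', function(chunk) { body += chunk; });

        response.on('end', function() {
            var handler = new htmlparser.DefaultHandler(function(err, dom) {
                if (err) {
                    sys.debug("Error: " + err);
                } else {
                    select(dom, 'a.title').forEach(function(title) {
                        newPages.push(title.attribs.href);
                    });
                    done(newPages); // hand back the result exactly once
                }
            });
            new htmlparser.Parser(handler).parseComplete(body);
        });
    });
    request.end();
};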

I like to use the request, cheerio and async modules to crawl web sites. This piece of code is shorter and, I think, more readable.

var request = require('request');
var cheerio = require('cheerio');
var async   = require('async');

function crawl(url, contentSelector, linkSelector, callback) {
    var results = [];
    var visited = {};

    var queue = async.queue(crawlPage, 5); // crawl 5 pages at a time
    queue.drain = function() { callback(results); }; // called once the queue is empty; pass the results along

    function crawlPage(url, done) {
        // make sure to visit each page only once
        if (visited[url]) return done(); else visited[url] = true;

        request(url, function(err, response, body) {
            if (!err) {
                var $ = cheerio.load(body); // "jQuery"
                results = results.concat(contentSelector($)); // add something to the results
                queue.push(linkSelector($)); // add links found on this page to the queue
            }
            done();
        });
    }
    // kick off the crawl with the starting URL
    queue.push(url);
}

function getStoryTitles($) {
    return $('a.title').map(function() { return $(this).text(); }).get(); // .get() returns a plain array
}

function getStoryLinks($) {
    return $('a.title').map(function() { return $(this).attr('href'); }).get();
}

crawl('http://www.reddit.com', getStoryTitles, getStoryLinks, function(stories) {
    console.log(stories); // all stories!
});

In the end you get an array of all the stories, which is probably what you wanted in the first place; it's just a different syntax. You can update your function to behave similarly, as AndyD suggested.

In the future, you will be able to use generators, which let you get the stories without a callback function; that is closer to what you want. See this article for more details.

function* crawl(url) {
    // do stuff
    yield story;
}

var crawler = crawl('http://www.reddit.com');
var firstStory = crawler.next().value;  // next() returns an object of the form { value, done }
var secondStory = crawler.next().value;
// ...
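A generator only pauses and resumes, so something still has to drive it while the asynchronous work completes. As a rough sketch of what such a driver could look like (run() and fetch() here are illustrative helpers, not part of any library), the generator yields promises and is resumed with each resolved value:

var http = require('http');

// Illustrative helper: wrap the callback-style API in a promise so it can be yielded.
function fetch(url) {
    return new Promise(function(resolve, reject) {
        http.get(url, function(res) {
            var body = '';
            res.on('data', function(chunk) { body += chunk; });
            res.on('end', function() { resolve(body); });
        }).on('error', reject);
    });
}

// Illustrative helper: resume the generator each time a yielded promise resolves.
function run(gen) {
    var it = gen();
    (function step(value) {
        var result = it.next(value);
        if (result.done) return;
        result.value.then(step);
    })();
}

run(function* () {
    var body = yield fetch('http://www.reddit.com/');
    console.log('fetched ' + body.length + ' characters');
});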
