简体   繁体   中英

simple scraper application on node.js / jquery

I'm trying to build a small application on Node.js with jQuery, that will request the contents at the location supplied. If the URL points to a resource that is not HTML, it will output the content type returned. Otherwise, it will read the HTML data and output the title of the page and the description, followed by a list of clickable links on the page. Here's my code:

var request             = require('request'),    
    jsdom               = require('jsdom'),
    fs                  = require('fs'),
    colors              = require('colors'),
    argv                = require('optimist').argv;
    
    colors.setTheme({ c1: 'blue', c2: 'red', c3: 'inverse' });

var myFunc = function( link, cb ){

console.log( 'requesting page: '.c3 + link.c3 );

// Step 1 - request to the page
request({
        uri: link,
    }, function (err, response, body) {
    
        // Handle response issues
        if ( err || response.statusCode !== 200 ) {
            if ( !response ){
                console.log( 'Ooops! page doesn`t exist or wrong URL format'.c2 )
            } else {
                console.log('error: '+ response.statusCode )
            }
            cb();

        } else {

            console.log( 'response code: ' + response.statusCode )
            
            // Step 2 - invoking jsdom and jQuery
            jsdom.env({
                html: body,
                src: [
                    fs.readFileSync(__dirname + "/lib/jquery-1.9.1.min.js").toString()
                ],
                done: function(err, window) {
                    
                    if(err) {
                        cb();
                    } else {

                        var $ = window.$;


                        // Step 3, final part - parse content with jQuery selectors
                        console.log( '\nThis page is:\n'.c1 + $(body)[0]._ownerDocument._contentType )
                        console.log( '\nPage title: \n'.c1 + $('title').text().trim() );
                        console.log( $('head meta[name="description"]').attr('content') !== undefined ? '\nPage description: \n'.c1 + $('head meta[name="description"]').attr('content') + '\n' : '\nPage description: \n'.c1 + 'No description on the page\n'.c2);
                        console.log( '\nClickable links on the page: \n'.c1 )

                        $('a').each(function(){
                            if ( $(this).attr('href') !== undefined ){
                                console.log( $(this).attr('href').slice(0, 4) == 'http' ? $(this).attr('href') : link + $(this).attr('href'))
                            } 
                        });

                        cb();
                    }
                } 
            })
        }
    }
);

};

So its perfectly scrapes an html pages, but i don't know how to implement this part

If the URL points to a resource that is not HTML, it will output the content type returned.

Please, share an idea how to do this part. Thanks in advance!

I never used JQuery, and am a begginner in javascript, but what I would have done is to get the url in a string, removed everything until the last dot (exemple: " http://www.web.com/content.php " => "php"), compared it to 'html', and print this, since it will be the content type.

EDIT:

//url is a string
function validate (url) {
    //Get whatever is after the last dot in the url
    //Compare to html
    //Return true if it is equals, or false if not
    return (url.substr(url.lastIndexOf('.')) === 'html');
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM