简体   繁体   中英

NodeJS : Response of http.request is NOT expected

Something odd happens when I use Node's http.request. My goal is to write a web crawler in NodeJS that fetches and parses the data on this webpage.

But the response that http.request returns does not match the HTML rendered by Chrome.

Here is the code.

var https = require('https');
var fs = require('fs');

// Request settings passed to https.request for the flight-search page.
// The query string is assembled from one entry per search parameter so each
// value is easy to spot and change.
var options = {
    host: 'book.flypeach.com',
    path: '/default.aspx?' + [
        'ao=B2CZHTW',
        'ori=KHH',
        'des=KIX',
        'dep=2015-09-10',
        'ret=2015-09-17',
        'adt=2',
        'chd=0',
        'inf=0',
        'langculture=zh-TW',
        'bLFF=false'
    ].join('&'),
    // NOTE(review): this turns off TLS certificate verification for the
    // request; confirm that is really required for this host.
    rejectUnauthorized: false
};

/**
 * Response handler for https.request: buffers the whole response body and
 * saves it to craw.html once the response has ended.
 *
 * @param {Object} response - readable response stream emitting 'data',
 *     'end' and 'error' events.
 */
var callback = function(response) {
    // Keep the raw bytes of every chunk. Appending each chunk to a string
    // decodes it on its own, which corrupts any multi-byte UTF-8 character
    // that happens to be split across two chunks.
    var chunks = [];
    response.on('data', function(chunk) {
        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    });

    // Report a broken response stream instead of letting the unhandled
    // 'error' event terminate the process.
    response.on('error', function(err) {
        console.log(err);
    });

    response.on('end', function() {
        // Write the bytes exactly as received so the saved file matches the
        // server's output.
        fs.writeFile('craw.html', Buffer.concat(chunks), function(err) {
            if (err) return console.log(err);
        });
    });
};
// Send the request. The 'error' listener reports DNS, connection and TLS
// failures; without it such a failure is thrown as an unhandled 'error' event.
https.request(options, callback)
    .on('error', function(err) {
        console.log(err);
    })
    .end();

I use fs.writeFile to store the output, but it differs from the page shown in the Chrome browser.

UPDATED : 2015 / 9 / 3

Today I tried phantomjs

Here is my new code, but it doesn't work either.

var system = require('system');
var page = require('webpage').create();
// Flight-search page that the script opens.
// NOTE(review): the log produced by this script reports "SSL handshake failed"
// under PhantomJS 1.9.8. That presumably comes from the default SSL protocol
// of that release; try running with `--ssl-protocol=any` and confirm.
var url = 'https://book.flypeach.com/default.aspx?' +
    'ao=B2CZHTW&ori=KHH&des=KIX' +
    '&dep=2015-09-10&ret=2015-09-17' +
    '&adt=2&chd=0&inf=0' +
    '&langculture=zh-TW&bLFF=false';

// Logs every resource request to stderr as 4-space-indented JSON.
page.onResourceRequested = function (requestData) {
    var log = system.stderr;
    log.writeLine('= onResourceRequested()');
    var dump = JSON.stringify(requestData, undefined, 4);
    log.writeLine('  request: ' + dump);
};

// Logs the id, stage and full JSON of every received resource response to stderr.
page.onResourceReceived = function(received) {
    var log = system.stderr;
    log.writeLine('= onResourceReceived()' );

    var summary = '  id: ' + received.id;
    summary += ', stage: "' + received.stage + '"';
    summary += ', response: ' + JSON.stringify(received);
    log.writeLine(summary);
};

// Logs, when a page load starts, the URL the page is currently on (the one
// being left).
page.onLoadStarted = function() {
    // Runs inside the page context and reports its current address.
    var readHref = function() {
        return window.location.href;
    };

    system.stderr.writeLine('= onLoadStarted()');
    var leaving = page.evaluate(readHref);
    system.stderr.writeLine('  leaving url: ' + leaving);
};

// Logs the status value reported when a page load finishes.
page.onLoadFinished = function(loadStatus) {
    var log = system.stderr;
    log.writeLine('= onLoadFinished()');
    log.writeLine('  status: ' + loadStatus);
};

// Logs each navigation request: target URL, cause, whether navigation will
// happen, and whether it came from the main frame.
page.onNavigationRequested = function(destination, cause, willNavigate, fromMainFrame) {
    var log = system.stderr;
    log.writeLine('= onNavigationRequested');
    log.writeLine('  destination_url: ' + destination);
    log.writeLine('  type (cause): ' + cause);
    log.writeLine('  will navigate: ' + willNavigate);
    log.writeLine('  from page\'s main frame: ' + fromMainFrame);
};

// Logs the URL, error code and error description of a resource that failed to load.
page.onResourceError = function(failure) {
    var log = system.stderr;
    log.writeLine('= onResourceError()');
    log.writeLine('  - unable to load url: "' + failure.url + '"');

    var detail = '  - error code: ' + failure.errorCode;
    detail += ', description: ' + failure.errorString;
    log.writeLine(detail);
};

// Logs an error message to stderr, followed by one line per trace entry
// when a trace is supplied.
page.onError = function(msg, trace) {
    system.stderr.writeLine('= onError()');

    var report = '  ERROR: ' + msg;
    if (trace) {
        report += '\n' + '  TRACE:';
        trace.forEach(function(frame) {
            // The function name is appended only when the frame has one.
            var where = '';
            if (frame.function) {
                where = ' (in function "' + frame.function + '")';
            }
            report += '\n' + '    -> ' + frame.file + ': ' + frame.line + where;
        });
    }
    system.stderr.writeLine(report);
};

// Opens the page, prints the load status, then exits PhantomJS.
page.open(url, function(status) {
  console.log(status);

  // NOTE(review): calling phantom.exit() synchronously inside this callback
  // is reported to trigger the repeated "Unsafe JavaScript attempt to access
  // frame with URL about:blank" messages on PhantomJS 1.9.8. Deferring the
  // exit to the next tick is the commonly cited workaround; confirm on the
  // target version.
  setTimeout(function() {
    phantom.exit();
  }, 0);
});

I get the detailed log

= onNavigationRequested
  destination_url: https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false
  type (cause): Other
  will navigate: true
  from page's main frame: true
= onResourceRequested()
  request: {
    "headers": [
        {
            "name": "User-Agent",
            "value": "Mozilla/5.0 (Unknown; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34"
        },
        {
            "name": "Accept",
            "value": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        }
    ],
    "id": 1,
    "method": "GET",
    "time": "2015-09-03T08:42:29.674Z",
    "url": "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"
}
= onLoadStarted()
  leaving url: about:blank
= onResourceError()
  - unable to load url: "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"
  - error code: 6, description: SSL handshake failed
= onResourceReceived()
  id: 1, stage: "end", response: {"contentType":null,"headers":[],"id":1,"redirectURL":null,"stage":"end","status":null,"statusText":null,"time":"2015-09-03T08:42:29.845Z","url":"https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"}
= onLoadFinished()
  status: fail
fail
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

You need to use some rendering engine, such as WebKit

Try phantomjs for it

http://phantomjs.org/ https://github.com/sgentle/phantomjs-node WebKit renderer and nodejs module for command bindings. Working very well with most websites

The most obvious reason is that your crawler does not execute JavaScript. That site uses JavaScript to change its HTML. You can compare how the site looks with and without JavaScript by pressing F12 in Chrome to open the Developer Tools, then clicking the Settings button in the upper right and ticking the checkbox that says Disable JavaScript.

Sites can also render different HTML based on parameters, headers (such as User Agent) etc., but that's probably not what is happening.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM