简体   繁体   中英

Node.js x-ray web scraper: multiple URLs in a loop callback

I'm using x-ray, which is great but lacks tutorials. I have an array of URLs named urls. In the loop, each URL is fetched and its result is returned through a callback. In the callback function I need to know which URL was parsed. How can I tell which URL a result belongs to if the callback receives only err and results? (Is this really an x-ray question, or a plain JavaScript one?)

 // Scrapes each entry of `urls` via `x` and post-processes the product records
 // inside the result callback before writing them to ./results.json.
 // NOTE(review): there is no var/let/const on this assignment, so unless
 // xrayRamiLevy is declared elsewhere this creates an implicit global.
 xrayRamiLevy = function(){
  // `var` is function-scoped: every callback created in the loop below shares
  // this single `index` binding rather than getting its own copy.
  var index = 0;
  for (index; index < urls.length; index++){
   // Each urls entry is read as an object with a `url` property here (and
   // `category` / `subcategory` properties inside the callback).
   x(urls[index].url, '.product_item',
  [{
      title : '.prodDescDiv h3',
      description : '.prodBrand',
      imageUrl : '.image_icons_zone .image img@src',
      // Raw onclick attribute; only used to derive catalogueNumber, then deleted.
      onclick : '.image_icons_zone .image a @onclick',
  }]
)
// The value returned by x(...) is called immediately with the result callback.
// NOTE(review): `err` is never inspected, so `results` is assumed to be an array.
(function(err, results){
    for (var i = 0; i < results.length; i++){

      // Keep the text between the last two single quotes of the onclick value.
      var s = results[i].onclick.substr(0, results[i].onclick.lastIndexOf("'"));
      s = s.slice(s.lastIndexOf("'") + 1);
      results[i].catalogueNumber = s;
      delete results[i].onclick;
      // The brace-less `if` guards only the next line, which collapses whitespace
      // runs in the description. The assignment after it always runs, so when
      // description is missing or empty it is set to the catalogue number in `s`.
      if (results[i].description !== undefined && results[i].description.length > 0)
      s = results[i].description.replace(/\s+/g, ' ').trim();
      results[i].description = s;

      // NOTE(review): `index` is the shared loop variable. If this callback runs
      // after the loop has finished, index === urls.length and urls[index] is
      // undefined, so these property reads would throw — confirm when x invokes it.
      if (urls[index].category !== undefined && urls[index].category.length > 0)
      results[i].categoriesIds = urls[index].category;

      if (urls[index].subcategory !== undefined && urls[index].subcategory.length > 0)
      results[i].subcategoriesIds = urls[index].subcategory;

    }
    // Every callback writes to the same path, and no completion callback is
    // passed to fs.writeFile.
    fs.writeFile("./results.json", JSON.stringify(results, null, '\t'));
});

} }

I didn't fully understand your example; however, you can wrap the callback in a function that captures the URL in a closure:

// Start one scrape per URL. The callback passed to x-ray only receives
// (err, results), so wrappedCallback(url) is used to bind the URL the
// results belong to.
urls.forEach(function (url) {
  // whatever you need to do to prep your call to x
  const callback = x(url, '.product_item', [{ /* selectors to scrape */ }]);
  callback(wrappedCallback(url));
});

/**
 * Creates a node-style (err, results) callback that closes over `url`, so the
 * callback body can tell which URL its results came from.
 *
 * @param {*} url - Value to make available inside the returned callback.
 * @returns {Function} Callback taking (err, results).
 */
function wrappedCallback(url) {
  const handler = function (err, results) {
    // url is defined here
  };
  return handler;
}

See the example below of getting the URL using a JavaScript closure. Note how the URL from the array is available inside the callback function.

const Xray = require('x-ray');
const util = require('util');
const x = Xray();

const sitesToHandle = ['https://dribbble.com?x=1', 'https://dribbble.com?x=2'];

// Each forEach iteration has its own `urlToHandle` binding, so the callback
// created in that iteration still sees the URL it was created for when it runs.
sitesToHandle.forEach((urlToHandle) => {
  // x(...) returns a function; invoke it directly with the result callback.
  x(urlToHandle, 'li.group', [{
    title: '.dribbble-img strong',
    image: '.dribbble-img [data-src]@data-src',
  }])(function (err, results) {
    if (err) {
      // Report the failure together with the URL it belongs to and skip the results.
      console.error(`failed to handle ${urlToHandle}: ${util.inspect(err)}`);
      return;
    }
    console.log(`let's now handle the result of ${urlToHandle}, the results are ${util.inspect(results)}`);
  });
});

P.S. As a side note, when handling the returned errors, you might take a look at an error-handling guide that I just wrote here.

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM