简体   繁体   中英

Scraping data with X-ray. Multiple sub pages

I'm trying to scrape www.metacritic.com for some data to create training module.

I'm able to use x-ray to scrape a single page but this particular page has a lot of subpages (for the 'letter categories'). I've tried to loop through several letters and perform multiple scrape events but I'm having trouble writing to a 'results.json' file using fs.appendFile.

I need a way to scrape THEN write to my file (it's currently just running both functions immediately).

var Xray = require('x-ray');
// Shared x-ray instance with string-helper filters usable from selectors
// (e.g. '.product_title | trim').
var xray = Xray({
  filters: {
    trim: function (value) {
      return typeof value === 'string' ? value.trim() : value
    },
    reverse: function (value) {
      return typeof value === 'string' ? value.split('').reverse().join('') : value
    },
    slice: function (value, start, end) {
      return typeof value === 'string' ? value.slice(start, end) : value
    }
  }
});

var request = require('request');
var fs = require('fs')
var letters = ['a','b','c','d']
var resultObj = []

// Truncate any previous results so each run starts from an empty file.
function eraseFile() {
  fs.writeFile('results.json', '', function() {console.log('Erased')})
}

eraseFile();

// Number of letter scrapes still in flight; the results file is written
// only when this reaches zero.
var pending = letters.length;

// BUG FIXES vs. the original:
//  - `for (i = ...)` leaked an implicit global; `var i` scopes it.
//  - `resultObj.concat(obj)` returns a NEW array and discards it; the items
//    are now pushed into resultObj instead.
//  - fs.appendFile ran synchronously right after the loop, before any scrape
//    had finished, so it always wrote "[]"; it now runs from saveResults()
//    once every letter's callback has reported back.
for (var i = 0; i < letters.length; i++) {
  xray('https://www.metacritic.com/browse/tv/title/all/' + letters[i], 'li.season_product', [{
    title: '.product_title | trim',
    score: '.metascore_w',
    url: 'a@href'
  }])
  .paginate('.flipper.next a@href')
  (function(err, obj) {
    if (err) {
      console.log(err)
    } else {
      // concat() would discard its result; push mutates resultObj in place.
      resultObj.push.apply(resultObj, obj)
    }
    pending -= 1;
    if (pending === 0) {
      saveResults();
    }
  })
}

// Persist the accumulated results once all scrapes have completed.
function saveResults() {
  fs.appendFile('results.json', JSON.stringify(resultObj), function(err) {
    if (err) { console.log(err) }

    console.log('scraped data saved to results.json')
  })
}

You are not using promises correctly. When you write to file the asynchronous code has not finished yet.

You could just not use the one resultObj and instead append each result to the file as it is received. A problem that still exists is that you are bombarding the site with requests, and the site may block you or see your requests as a DDoS attack. I can provide a throttled example if you need it, but without throttling the code would look something like this:

var Xray = require('x-ray');

// Shared x-ray instance exposing a few string-helper filters that can be
// referenced from selectors (e.g. '.product_title | trim').
var xray = Xray({
  filters: {
    // Strip surrounding whitespace from string values; pass others through.
    trim: value => (typeof value === 'string' ? value.trim() : value),
    // Reverse the characters of string values; pass others through.
    reverse: value => (typeof value === 'string' ? value.split('').reverse().join('') : value),
    // Take the [start, end) substring of string values; pass others through.
    slice: (value, start, end) => (typeof value === 'string' ? value.slice(start, end) : value)
  }
});

var request = require('request');
var fs = require('fs')
var letters = ['a','b','c','d']

// Truncate any previous results so each run starts from an empty file.
function eraseFile() {
  fs.writeFile('results.json', '', function() {console.log('Erased')})
}

eraseFile();

// Build a zero-argument thunk that, when invoked, starts an x-ray scrape of
// the listing page for one letter and returns x-ray's promise of the rows.
// Deferring the call lets the caller decide when to fire each request.
const makeXrayRequestFunction = letter => () => {
  const rowSelector = [{
    title: '.product_title | trim',
    score: '.metascore_w',
    url: 'a@href'
  }];
  return xray('https://www.metacritic.com/browse/tv/title/all/' + letter, 'li.season_product', rowSelector)
    .paginate('.flipper.next a@href');
};
// Append one page's scraped results to results.json as soon as they arrive,
// instead of accumulating everything in memory first.
// Resolves with the original data on success; rejects with the fs error.
const handleXrayFinishedRequest =
  obj =>
    new Promise(
      // BUG FIX: the executor parameter was misspelled "resove", so the
      // success branch's resolve(obj) threw a ReferenceError on every
      // successful write.
      (resolve, reject) => {
        fs.appendFile(
          'results.json'
          , JSON.stringify(obj)
          , err =>
            err ?
              reject(err) //could not write to file, return rejected promise
              : resolve(obj) //could write to file, return obj
        )
      }
    )
;
// Log-and-swallow handler for a failed scrape or file write. Returning
// undefined turns the rejection into a fulfilled promise, so one bad
// letter cannot reject the whole Promise.all batch.
const failedXrayRequest = err => {
  console.log("failed:", err);
};
// Kick off one scrape per letter in parallel. Each scrape's rows are
// appended to results.json the moment they arrive; failures are logged and
// swallowed so a single bad letter does not abort the batch.
const scrapes = letters
  .map(makeXrayRequestFunction)        // letter -> thunk that starts the scrape
  .map(startScrape =>
    startScrape()                      // fire the request; returns a promise
      .then(handleXrayFinishedRequest) // scrape succeeded: append to file
      .catch(failedXrayRequest)        // scrape or write failed: just log it
  );

Promise.all(scrapes)
  .then(() => console.log("Finished scraping"));

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM