
Scraping multiple web pages with Cheerio

I'm learning to use Cheerio to scrape data from web pages. I already know how to get data from a single page, but now I'm trying to figure out how to do the same with multiple pages.

I have two separate functions, one for each URL. In my index.js I'm using the functions like this:

const express = require('express');
const scraper = require('./scraper');
const fs = require('fs');
const app = express();
app.get('/search/:title', (req, res) => {
  scraper.func1(req.params.title).then(cars => {
    res.json(cars);
    fs.writeFile(
      './json/cars.json',
      JSON.stringify(cars, null, 2), // optional params to format it nicely
      err =>
        err
          ? console.error('Data not written!', err)
          : console.log('Data written!')
    );
  });
  scraper.func2(req.params.title).then(cars => {
    res.json(cars);
    fs.writeFile(
      './json/cars2.json',
      JSON.stringify(cars, null, 2), // optional params to format it nicely
      err =>
        err
          ? console.error('Data2 not written!', err)
          : console.log('Data2 written!')
    );
  });
});
const port = process.env.PORT || 3000;
app.listen(port, () => {
  console.log(`Listening on ${port}`);
}); 

Obviously these two functions don't work when chained like this. Separately they both work just fine. So my question is, how should I chain these two functions to use them correctly?
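For reference, each function is shaped roughly like this (a simplified sketch; the URLs and CSS selectors are placeholders for the real ones):

const axios = require('axios');
const cheerio = require('cheerio');

// Fetch a search page and pull the listings out of the HTML.
// The URL and '.listing' / '.title' selectors are placeholders.
async function func1(title) {
  const { data } = await axios.get(`https://site-one.example/search?q=${encodeURIComponent(title)}`);
  const $ = cheerio.load(data);
  return $('.listing')
    .map((i, el) => ({ title: $(el).find('.title').text().trim() }))
    .get();
}

// func2 has the same shape but targets the second site.
async function func2(title) {
  const { data } = await axios.get(`https://site-two.example/search?q=${encodeURIComponent(title)}`);
  const $ = cheerio.load(data);
  return $('.listing')
    .map((i, el) => ({ title: $(el).find('.title').text().trim() }))
    .get();
}

module.exports = { func1, func2 };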

I would use the async/await syntax for this purpose; it will keep the code a bit cleaner.

We'll call each function in sequence, then combine the responses and send them back to the client.

const express = require('express');
const scraper = require('./scraper');
const fs = require('fs');
const app = express();

function writeJsonToFile(fileName, data) {
    fs.writeFile(fileName,
        JSON.stringify(data, null, 2), // optional params to format it nicely
        err =>
            err
                ? console.error('Data not written!', err)
                : console.log(`Data written to file: ${fileName}!`)
    );
}

app.get('/search/:title', async (req, res) => {

    try { 
        let cars1 = await scraper.func1(req.params.title);
        writeJsonToFile('./json/cars1.json', cars1);

        let cars2 = await scraper.func2(req.params.title);
        writeJsonToFile('./json/cars2.json', cars2);

        let combinedResponse = { cars1, cars2 };
        res.json(combinedResponse);

    } catch (err) {
        res.json({ error: `Something bad happened: ${err.message}` });
    }
})

const port = process.env.PORT || 3000;
app.listen(port, () => {
    console.log(`Listening on ${port}`);
});
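Since the two scrapes are independent of each other, you could also run them concurrently rather than one after the other. A minimal variant of the handler using Promise.all, reusing the writeJsonToFile helper above and assuming func1 and func2 both return promises:

app.get('/search/:title', async (req, res) => {
    try {
        // Start both scrapes at once and wait for both to finish.
        const [cars1, cars2] = await Promise.all([
            scraper.func1(req.params.title),
            scraper.func2(req.params.title)
        ]);

        writeJsonToFile('./json/cars1.json', cars1);
        writeJsonToFile('./json/cars2.json', cars2);

        res.json({ cars1, cars2 });
    } catch (err) {
        // Send an error status code so the client can tell the request failed.
        res.status(500).json({ error: `Something bad happened: ${err.message}` });
    }
});

This cuts the total wait to the slower of the two scrapes instead of their sum, at the cost of hitting both sites at the same time.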
