简体   繁体   中英

Scrape multiple websites using NodeJS, Express, Cheerio and Axios

I would like to scrape multiple websites using NodeJS, Express, Cheerio and Axios. I'm now able to scrape one website and display the information in the HTML page. But when I try to scrape multiple websites looking for the same element, it doesn't go through the whole forEach (it stops after one cycle). Note the loop that doesn't work correctly: urls.forEach(url => {

The two most important files are index.js:

// Port number passed to app.listen further down in this file.
const PORT = 8000
const axios = require('axios')
const cheerio = require('cheerio')
const express = require('express')
const app = express()
const cors = require('cors')
// Install the cors middleware on the app, ahead of the route definitions.
app.use(cors())

// Pages that the /results route fetches and scans.
const urls = ['https://www.google.nl','https://www.google.de']
// const url = 'https://www.heineken.com/nl/nl/'
// GET / replies with a fixed JSON string.
app.get('/', function(req, res){
  res.json('Robin')
})

// Registers a GET /results handler once per entry in `urls`. Each handler
// fetches its own URL, collects the `namespace` value of every <script>
// element that contains 'w3.org', and replies with that list as JSON.
// NOTE(review): the same path is registered on every iteration and each
// handler finishes the response with res.json, so a request is answered with
// the results of a single URL only — this is the problem the question is about.
urls.forEach(url => {
  app.get('/results', (req, res) => {
    axios(url)
      .then(response => {
        const html = response.data
        const $ = cheerio.load(html)
        // Matches collected for this one URL.
        const articles = []

        $('script', html).each(function(){
          // `namespace` is read straight off the first underlying node object;
          // presumably the element's namespace URI — TODO confirm against cheerio docs.
          const link = $(this).get()[0].namespace
          if (link !== undefined) {
            if (link.indexOf('w3.org') > -1) {
             articles.push({
               link
             })
            }
          }
        })
        res.json(articles)
      // The error is only logged; no response is sent from this branch.
      }).catch(err => console.log(err))
 })
})

// Start the HTTP server. The message must be a template literal (backticks):
// with single quotes `${PORT}` is printed literally instead of the port number.
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`))

App.js:

// Client-side script: requests the scraped results from the local API and
// renders each entry's `link` as an <h3> inside the #root element.
const root = document.querySelector('#root')

fetch('http://localhost:8000/results')
  .then(response => response.json())
  .then(data => {
    console.log(data)
    data.forEach(article => {
      // The link text originates from scraped third-party pages, so build the
      // element and assign textContent instead of interpolating into HTML.
      const heading = document.createElement('h3')
      heading.textContent = article.link
      root.append(heading)
    })
  })
  // Report network or JSON-parsing failures rather than leaving the
  // rejection unhandled.
  .catch(err => console.error(err))

You're registering multiple route handlers for the same route. Express will only route requests to the first one. Move your URL loop inside app.get("/results", ...) ...

// GET /results: fetches every URL in `urls` in parallel, collects the
// `namespace` value of each <script> element that contains "w3.org", and
// responds with one flat JSON array of { link } objects.
// If any fetch fails, Promise.all rejects and the error is passed to Express.
app.get("/results", async (req, res, next) => {
  try {
    const perUrlArticles = await Promise.all(
      urls.map(async (url) => {
        const { data } = await axios(url);
        const $ = cheerio.load(data);
        const articles = [];

        // Query the already-loaded document. The earlier version passed an
        // undefined `html` variable as context, which threw a ReferenceError.
        $("script").each(function () {
          const link = $(this).get()[0].namespace;
          if (link !== undefined && link.includes("w3.org")) {
            articles.push({ link });
          }
        });
        return articles;
      })
    );

    res.json(perUrlArticles.flat()); // un-nest each array of articles
  } catch (err) {
    console.error(err);
    next(err); // make sure Express responds with an error
  }
});

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM