简体   繁体   中英

Scrape page then click then scrape next using Puppeteer/Javascript

I'm trying to get my first Puppeteer script up and running. So far I can scrape the first page on but my click function doesn't work.

The goal is to be able to scrape every page (until the "next button") is not visible anymore.

Any suggestions on how to do so?

const puppeteer = require('puppeteer');


(async () => {
  //{headless: false}
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  await page.goto("pages", { waitUntil: 'domcontentloaded' });

  // example: get innerHTML of an element
  let urls = await page.evaluate(() => {
    const prop_list = document.querySelectorAll('.sold-property-listing');
    let list = [];
    let broker, price, price_kvm, address, kvm, rooms, date_sold, monthly_fee, size, area, type, id, url, bid_change;
    for(var i = 0; i < prop_list.length; i++){
      var el = prop_list[i];
      url = el.querySelector('a').href;
      id = "";
      broker = el.querySelector('.sold-property-listing__broker').innerText;
      price = el.querySelector('div.sold-property-listing__price > div:nth-child(1) > span').innerText;
      price_kvm = el.querySelector('div.sold-property-listing__price > div:nth-child(2) > div.sold-property-listing__price-per-m2.sold-property-listing--left').innerText
      date_sold = el.querySelector('.sold-property-listing__sold-date.sold-property-listing--left').innerText;
      address = el.querySelector('div.sold-property-listing__location > h2 > span.item-result-meta-attribute-is-bold.item-link').innerText;
      price = price.replace(/\D/g, '');
      price_kvm = price_kvm.replace(/\D/g, '');
      date_sold = date_sold.replace('Såld ', '');
      monthly_fee = el.querySelector('div.sold-property-listing__size > div > div.sold-property-listing__fee');
      size = el.querySelector('div.sold-property-listing__size > div > div.sold-property-listing__subheading.sold-property-listing--left').innerText.split('m²');
      area = el.querySelector('div.sold-property-listing__location > div > span.item-link')
      type = el.querySelector('div.sold-property-listing__location > div > span.hide-element')
      bid_change = document.querySelector('div.sold-property-listing__price-change');

      if(bid_change){bid_change = bid_change.innerText.replace(' %', '')}else{bid_change = '0'}
      if(area){area = area.innerText.replace(',', '');}else{area = 'Unkown'}
      if(url){id = url.split('-');id = id[id.length-1]}else{id = null}
      if(type){type = type.innerText;}else{type = 'Unkown'}
      if(size[0]){kvm = size[0].replace(/\s/g, '');}else{kvm = null;}
      if(size[1]){rooms = size[1].replace(/\D/g, '');}else{rooms = null;}
      if(monthly_fee){monthly_fee = monthly_fee.innerText.replace(/\D/g, '');}else{monthly_fee = null}

      list.push({'id': id, 'address': address, 'broker': broker, 'price': price, 'price_kvm': price_kvm, 'sold_date': date_sold, 'kvm': kvm, 'rooms': rooms, 'area': area, 'rooms': rooms, 'monthly_fee': monthly_fee, 'bid_change': bid_change})
    }
    return list;
  })
  console.log(urls);

    // Use Promise.all to wait for two actions (navigation and click)
  await Promise.all([
    page.waitForNavigation(), // wait for navigation to happen
    page.click('a.next_page'), // click link to cause navigation        
  ]);

  // close brower when we are done
  await browser.close();
})();

Came up with a solution that is working fine:

const puppeteer = require('puppeteer');

console.log('into web_scraper')
const webscraping = async pageAmount => {
  console.log('into webscraping')
  return new Promise(async (resolve, reject) => {
    try {
      if (!pageAmount) {
        pageAmount = 1;
      }
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      await page.goto("https://www.hemnet.se/salda/bostader?location_ids%5B%5D=898741");
      let currentPage = 1;
      let dataObj = [];
      while (currentPage <= pageAmount) {
        const results = await page.evaluate(() => {
          const prop_list = document.querySelectorAll('.sold-property-listing');
          let list = [];
          let broker, price, price_kvm, address, kvm, rooms, date_sold, monthly_fee, size, area, type, id, url, bid_change;
          for(var i = 0; i < prop_list.length; i++) {
            var el = prop_list[i];
            url = el.querySelector('a').href;
            id = ""
            broker = el.querySelector('.sold-property-listing__broker').innerText;
            price = el.querySelector('div.sold-property-listing__price > div:nth-child(1) > span').innerText;
            price_kvm = el.querySelector('div.sold-property-listing__price > div:nth-child(2) > div.sold-property-listing__price-per-m2.sold-property-listing--left').innerText
            date_sold = el.querySelector('.sold-property-listing__sold-date.sold-property-listing--left').innerText;
            address = el.querySelector('div.sold-property-listing__location > h2 > span.item-result-meta-attribute-is-bold.item-link').innerText;
            price = price.replace(/\D/g, '');
            price_kvm = price_kvm.replace(/\D/g, '');
            date_sold = date_sold.replace('Såld ', '');
            monthly_fee = el.querySelector('div.sold-property-listing__size > div > div.sold-property-listing__fee');
            size = el.querySelector('div.sold-property-listing__size > div > div.sold-property-listing__subheading.sold-property-listing--left')
              .innerText.split('m²');
            area = el.querySelector('div.sold-property-listing__location > div > span.item-link')
            type = el.querySelector('div.sold-property-listing__location > div > span.hide-element')
            bid_change = document.querySelector('div.sold-property-listing__price-change');
            url = document.location.href;
            if(bid_change){bid_change = bid_change.innerText.replace(' %', '')}else{bid_change = '0'}
            if(area){area = area.innerText.replace(',', '');}else{area = 'Unkown'}
            if(url){id = url.split('-');id = id[id.length-1]}else{id = null}
            if(type){type = type.innerText;}else{type = 'Unkown'}
            if(size[0]){kvm = size[0].replace(/\s/g, '');}else{kvm = null;}
            if(size[1]){rooms = size[1].replace(/\D/g, '');}else{rooms = null;}
            if(monthly_fee){monthly_fee = monthly_fee.innerText.replace(/\D/g, '');}else{monthly_fee = null}

            list.push({'url': url, 'id': id, 'address': address, 'broker': broker, 'price': price, 'price_kvm': price_kvm, 'sold_date': date_sold, 'kvm': kvm, 'rooms': rooms, 'area': area, 'rooms': rooms, 'monthly_fee': monthly_fee, 'bid_change': bid_change})
          }
          return list;
        })
        dataObj = dataObj.concat(results);
        if (currentPage < pageAmount) {
          await Promise.all([
            await page.waitForSelector('a.next_page'),
            await page.click('a.next_page'),
            await page.waitForSelector('.sold-property-listing')
          ])
        }
        currentPage++;
      }
      browser.close();
      return resolve(dataObj);
    } catch (e) {
      return reject(e);
    }
  })
}
module.exports = webscraping;

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM