简体   繁体   中英

Pagination with Puppeteer

I watched 10+ tutorials but I'm still unable to solve this basic operation.

How do I paginate through the Glassdoor result pages I'm scraping with Puppeteer?

Here I successfully log in to Glassdoor (the login details have been replaced with ****) and scrape basic details such as companyName, datePosted, etc. that are visible on the first page:

/**
 * Logs in to Glassdoor, opens the first page of the London internship search
 * and extracts the basic details of every job card found on it.
 *
 * @param {object} page - Puppeteer page used for all navigation and input.
 * @returns {Promise<Array<{companyName: string, url: string, datePosted: string}>>}
 *   One entry per `[data-test='jobListing']` element. If any step throws, the
 *   error is logged and an empty array is returned so callers can still
 *   iterate over the result.
 */
async function scrapeListings(page) {
  try {
    // Navigate to the home page and log in to Glassdoor.
    await page.goto("https://www.glassdoor.co.uk/index.htm");
    await page.click(
      "#SiteNav > nav > div.d-lg-none.d-flex.align-items-center.justify-content-between.px-std.py-xsm.px-md-lg.py-md-std.LockedHomeHeaderStyles__bottomBorder.LockedHomeHeaderStyles__fullWidth > div.d-flex.justify-content-center.order-1.order-md-2.LockedHomeHeaderStyles__flexibleContainer > button",
      { delay: 200 }
    );
    await page.type("#userEmail", "*******", { delay: 200 });
    await page.type("#userPassword", "*******", { delay: 200 });

    // Start waiting for the navigation BEFORE triggering it; otherwise the
    // navigation may finish first and waitForNavigation() would time out.
    await Promise.all([
      page.waitForNavigation(),
      page.click(".mt-std.d-flex.flex-column.align-items-center", {
        delay: 200,
      }),
    ]);

    await page.goto(
      "https://www.glassdoor.co.uk/Job/london-internship-jobs-SRCH_IL.0,6_IC2671300_KO7,17_IP1.htm"
    );

    const html = await page.content();
    const $ = cheerio.load(html);

    return $("[data-test='jobListing']")
      .map((index, element) => {
        const titleElement = $(element).find(".css-l2wjgv.e1n63ojh0.jobLink");
        const timeElement = $(element).find("[data-test='job-age']");

        // companyName is the text of the job link element.
        const companyName = $(titleElement).text();
        const url = "https://www.glassdoor.co.uk" + $(titleElement).attr("href");
        const datePosted = $(timeElement).text();

        return { companyName, url, datePosted };
      })
      .get();
  } catch (erro) {
    console.error(erro);
    // Keep the return type stable so the caller can still read `.length`.
    return [];
  }
}

In this second part I take the URL of each job listing and scrape its page for more details:


/**
 * Visits the URL of every listing, extracts extra details from the job page,
 * stores them on the listing object (mutating it in place) and saves the
 * enriched listing through `GlassdoorDB`.
 *
 * A failure while processing one listing is logged and does not stop the
 * remaining listings from being processed.
 *
 * @param {Array<object>} listings - Listings with a `url` property.
 * @param {object} page - Puppeteer page used to open each job page.
 * @returns {Promise<Array<object>>} The same `listings` array that was passed in.
 */
async function scrapeJobDescriptions(listings, page) {
  for (const listing of listings) {
    try {
      await page.goto(listing.url);
      const html = await page.content();
      const $ = cheerio.load(html);

      const jobDescription = $("#JobDescriptionContainer").html();
      const location = $(
        ".css-f4rs18.css-1e169oc.efy8art2 > div > div > div:nth-child(3)"
      ).text();
      const jobSalary = $(
        ".css-1v5elnn.e11nt52q2 .small.css-10zcshf.e1v3ed7e1"
      ).text();
      const jobPosition = $(".css-17x2pwl").text();
      const applyLink =
        "https://www.glassdoor.co.uk" +
        $(".css-0.e1h54cx80 a").attr("data-job-url");
      // $eval throws when the image is missing; the catch below then skips
      // this listing without saving it.
      const companyImage = await page.$eval(
        ".css-13u5hxa.epu0oo22 img",
        (img) => img.src
      );

      listing.jobDescription = jobDescription;
      listing.location = location;
      listing.jobSalary = jobSalary;
      listing.jobPosition = jobPosition;
      listing.applyLink = applyLink;
      listing.companyImage = companyImage;
      console.log(listing.jobDescription);

      const listingModel = new GlassdoorDB(listing);
      await listingModel.save();
      await sleep(1000); // 1 second pause between job pages
    } catch (err) {
      console.error(err);
    }
  }
  return listings;
}

/**
 * Pauses the calling async function for the given amount of time.
 *
 * @param {number} miliseconds - Delay passed to `setTimeout`.
 * @returns {Promise<undefined>} Settles once the timer has fired.
 */
async function sleep(miliseconds) {
  await new Promise((wake) => {
    setTimeout(wake, miliseconds);
  });
}

/**
 * Entry point: connects to MongoDB, launches a visible browser, scrapes the
 * listings and their job pages, then logs the resulting listings.
 *
 * The browser is always closed, even when scraping throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await connectToMongoDb();
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    const listings = await scrapeListings(page);
    // scrapeJobDescriptions enriches the listing objects in place.
    await scrapeJobDescriptions(listings, page);
    console.log(listings);
  } finally {
    await browser.close();
  }
}

// Run the scraper; report any failure instead of leaving the promise
// rejection unhandled, and mark the process as failed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

How can I get to the next page after the first page, with its 30 results, has been scraped?

You could check the URL pattern of the pagination - as Molda recommended - and, instead of awaiting the Puppeteer actions and then clicking the "next" button, navigate straight to each following page with page.goto. E.g.: https://example.net?page=1 etc.

In most paginations you can retrieve the number of the last page ( el.href.match(/\d+/) ) if there is a "last" button that jumps to the very last page.

Eg:

await page.goto('https://example.net')
// The "last" pager link's href carries the highest page number; take the
// first run of digits found in it.
const lastPageLink = (await page.$$('.pager-last > a'))[0]
const lastPage = await page.evaluate(el => el.href.match(/\d+/), lastPageLink)
// Parse once, with an explicit radix, instead of re-parsing on every
// evaluation of the loop condition.
const lastPageNumber = Number.parseInt(lastPage[0], 10)

for (let i = 0; i <= lastPageNumber; i++) {
  try {
    await page.goto('https://example.net?page=' + i)
    // manipulating the DOM
  } catch (e) {
    console.error(e)
  }
}

On the Glassdoor site you will be able to retrieve the length of the pagination from the following <div> :

<div class="cell middle d-none d-md-block py-sm" data-test="page-x-of-y">Page 1 of 17</div>
// Strip the "Page X of " prefix so only the total page count remains,
// e.g. "Page 1 of 17" -> "17". `\d+` also covers multi-digit current pages.
const pageCounter = (await page.$$('[data-test="page-x-of-y"]'))[0]
const lastPage = await page.evaluate(el => el.innerText.replace(/page\s\d+\sof\s/i, ''), pageCounter)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM