I watched 10+ tutorials but I'm still unable to solve this basic operation.
How do I paginate with puppeteer the Glassdoor page I'm scraping?
Here I successfully log in to Glassdoor (the login details have been replaced with ****) and scrape basic details such as companyName, datePosted, etc. that are visible on the first page:
/**
 * Logs in to Glassdoor, opens the London internship search results and
 * extracts one record per job card found on that first results page.
 *
 * @param {object} page - Puppeteer page used for every navigation.
 * @returns {Promise<Array<{companyName: string, url: string, datePosted: string}>>}
 *   One entry per job card; an empty array when any step fails (the error
 *   is logged), so callers can always iterate the result.
 */
async function scrapeListings(page) {
  const baseUrl = "https://www.glassdoor.co.uk";
  try {
    // Navigating to the list of jobs, i.e. logging in to Glassdoor first.
    await page.goto("https://www.glassdoor.co.uk/index.htm");
    await page.click(
      "#SiteNav > nav > div.d-lg-none.d-flex.align-items-center.justify-content-between.px-std.py-xsm.px-md-lg.py-md-std.LockedHomeHeaderStyles__bottomBorder.LockedHomeHeaderStyles__fullWidth > div.d-flex.justify-content-center.order-1.order-md-2.LockedHomeHeaderStyles__flexibleContainer > button",
      { delay: 200 }
    );
    await page.type("#userEmail", "*******", { delay: 200 });
    await page.type("#userPassword", "*******", { delay: 200 });

    // Start waiting for the navigation BEFORE the click that triggers it;
    // awaiting the click first can miss a navigation that already finished.
    await Promise.all([
      page.waitForNavigation(),
      page.click(".mt-std.d-flex.flex-column.align-items-center", {
        delay: 200,
      }),
    ]);

    await page.goto(
      "https://www.glassdoor.co.uk/Job/london-internship-jobs-SRCH_IL.0,6_IC2671300_KO7,17_IP1.htm"
    );
    const html = await page.content();
    const $ = cheerio.load(html);

    return $("[data-test='jobListing']")
      .map((index, element) => {
        const titleElement = $(element).find(".css-l2wjgv.e1n63ojh0.jobLink");
        const timeElement = $(element).find("[data-test='job-age']");
        return {
          // Text of the job-link element, stored under the existing key name.
          companyName: titleElement.text(),
          url: `${baseUrl}${titleElement.attr("href")}`,
          datePosted: timeElement.text(),
        };
      })
      .get();
  } catch (erro) {
    console.error(erro);
    return [];
  }
}
In this second part I take the URL of each job listing and scrape its page for more details:
/**
 * Visits the detail page of every listing, copies the scraped fields onto
 * the listing object and saves it through the GlassdoorDB model.
 *
 * A failure on one listing (navigation, missing company image, save error)
 * is logged and that listing is skipped; the remaining listings are still
 * processed.
 *
 * @param {Array<object>} listings - Objects with a `url` property; mutated in place.
 * @param {object} page - Puppeteer page used to open each listing URL.
 * @returns {Promise<Array<object>>} The same `listings` array, or an empty
 *   array when `listings` is not an array.
 */
async function scrapeJobDescriptions(listings, page) {
  if (!Array.isArray(listings)) {
    return [];
  }

  for (const listing of listings) {
    try {
      // Navigation lives inside the try so one unreachable page does not
      // abort the whole run.
      await page.goto(listing.url);
      const html = await page.content();
      const $ = cheerio.load(html);

      const jobDescription = $("#JobDescriptionContainer").html();
      const location = $(
        ".css-f4rs18.css-1e169oc.efy8art2 > div > div > div:nth-child(3)"
      ).text();
      const jobSalary = $(
        ".css-1v5elnn.e11nt52q2 .small.css-10zcshf.e1v3ed7e1"
      ).text();
      const jobPosition = $(".css-17x2pwl").text();
      const applyLink =
        "https://www.glassdoor.co.uk" +
        $(".css-0.e1h54cx80 a").attr("data-job-url");

      // page.$eval rejects when the image is absent; the listing is then
      // left untouched and not saved.
      const companyImage = await page.$eval(
        ".css-13u5hxa.epu0oo22 img",
        (img) => img.src
      );

      Object.assign(listing, {
        jobDescription,
        location,
        jobSalary,
        jobPosition,
        applyLink,
        companyImage,
      });
      console.log(listing.jobDescription);

      const listingModel = new GlassdoorDB(listing);
      await listingModel.save();
      await sleep(1000); // 1 second pause between detail pages
    } catch (err) {
      console.error(err);
    }
  }

  return listings;
}
/**
 * Pauses the calling async function.
 *
 * @param {number} miliseconds - Delay handed to setTimeout.
 * @returns {Promise<void>} Settles once the timer has fired.
 */
async function sleep(miliseconds) {
  await new Promise((done) => {
    setTimeout(done, miliseconds);
  });
}
/**
 * Entry point: connects to MongoDB, launches a visible browser, scrapes the
 * listings and their detail pages, then logs the collected listings.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await connectToMongoDb();
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    const listings = await scrapeListings(page);
    await scrapeJobDescriptions(listings, page);
    console.log(listings);
  } finally {
    // Close the browser even when scraping fails, so no browser process is
    // left running.
    await browser.close();
  }
}

// Surface failures instead of leaving an unhandled promise rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
How can I get to the next page after the first page, with its 30 results, has been scraped?
You could check the URL pattern of the pagination - as Molda recommended - and, instead of clicking the "next" button and awaiting the resulting Puppeteer actions, call page.goto
directly for each of the following pages, e.g. https://example.net?page=1 etc.
In most paginations you can retrieve the number of the last page ( el.href.match(/\d+/)
) if there is a "last" button that jumps to the very last page.
Eg:
// Open the first page so the pager link can be read.
await page.goto('https://example.net');

// Take the first run of digits from the "last" pager link's href.
// page.$eval rejects if the link is not on the page.
const lastPage = await page.$eval('.pager-last > a', (el) => el.href.match(/\d+/));
if (!lastPage) {
  throw new Error('No page number found in the ".pager-last > a" href');
}

// Parse once, with an explicit radix, instead of on every loop iteration.
const lastPageNumber = Number.parseInt(lastPage[0], 10);

for (let i = 0; i <= lastPageNumber; i++) {
  try {
    await page.goto('https://example.net?page=' + i);
    // manipulating the DOM
  } catch (e) {
    // A failure on one page is logged; the loop moves on to the next page.
    console.error(e);
  }
}
On the Glassdoor site you can retrieve the number of pages in the pagination from the following <div>:
<div class="cell middle d-none d-md-block py-sm" data-test="page-x-of-y">Page 1 of 17</div>
// Read the total page count (as a string, e.g. "17") from the "Page X of Y" label.
// `\d+` lets the current page number have more than one digit ("Page 12 of 17").
const lastPage = await page.$eval('[data-test="page-x-of-y"]', (el) =>
  el.innerText.replace(/page\s\d+\sof\s/i, '')
);
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.