I tried to scrape Reddit using puppeteer and Node.js. There is my code, where I:
const puppeteer = require("puppeteer");
const self = {
browser: null,
page: null,
initialize: async () => {
browser = await puppeteer.launch({
headless: false,
});
page = await browser.newPage();
// Go to the index page of Reddit
await page.goto("https://old.reddit.com/", { waitUntil: "networkidle0" });
},
getResults: async () => {
let platform = "Reddit";
// Get all posts on the main page of Reddit.
let mentions = await page.$$('#siteTable > div[class *= "thing"]');
let results = [];
// For each post:
for (let mention of mentions) {
let content = "";
// I get the link to its content page.
let content_URL = await mention.$eval(
'p[class="title"] > a[class*="title"]',
(node) => node.getAttribute("href").trim()
);
// if it is a inner link:
if (content_URL.substr(0, 3) === "/r/") {
// Create a new page to open that content page.
let contentPage = await browser.newPage();
await contentPage.goto("https://old.reddit.com" + content_URL, {
waitUntil: "networkidle0",
});
// Get the first paragraph of this content page.
content = await contentPage.evaluate((contentPage) => {
// Here is where the error occurred:
// Error: Evaluation failed: TypeError: Cannot read property 'querySelector' of undefined
let firstParagraph = contentPage.querySelector(
'div[class*="usertext-body"] > p'
);
if (firstParagraph != null) {
return firstParagraph.innerText.trim();
} else {
return null;
}
});
}
results.push({
title,
content,
image,
date,
popularity,
platform,
});
}
return results;
},
};
module.exports = self;
But an error occurred: Error: Evaluation failed: TypeError: Cannot read property 'querySelector' of undefined
.
Could anyone point out where I did wrong plz?
Thanks!
page.evaluate
basically executes code in the context of the browser. IE: the same stuff you would put into the browser developer console to get the same result. So in this context, you would want to use document.querySelector()
instead of a reference to contentPage
which isn't defined:
let firstParagraph = document.querySelector(
'div[class*="usertext-body"] > p'
);
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.