如何使用 puppeteer 抓取新頁面？

Question

我嘗試使用 puppeteer 和 Node.js 來抓取 Reddit。 有我的代碼，我在哪里：

為 Reddit 的主頁打開一個頁面，
獲取所有帖子。
對於每篇文章，我都會獲得指向其內容頁面的鏈接。
為每個內容頁面打開一個新頁面。
刮掉每個內容頁面。

const puppeteer = require("puppeteer");

const self = {
  browser: null,
  page: null,

  initialize: async () => {
    browser = await puppeteer.launch({
      headless: false,
    });
    page = await browser.newPage();

    // Go to the index page of Reddit
    await page.goto("https://old.reddit.com/", { waitUntil: "networkidle0" });
  },

  getResults: async () => {
    let platform = "Reddit";

    // Get all posts on the main page of Reddit.
    let mentions = await page.$$('#siteTable > div[class *= "thing"]');
    let results = [];

    // For each post:
    for (let mention of mentions) {
      let content = "";

      // I get the link to its content page.
      let content_URL = await mention.$eval(
        'p[class="title"] > a[class*="title"]',
        (node) => node.getAttribute("href").trim()
      );

      // if it is a inner link:
      if (content_URL.substr(0, 3) === "/r/") {

        // Create a new page to open that content page. 
        let contentPage = await browser.newPage();
        await contentPage.goto("https://old.reddit.com" + content_URL, {
          waitUntil: "networkidle0",
        });

        // Get the first paragraph of this content page.
        content = await contentPage.evaluate((contentPage) => {
          
          // Here is where the error occurred: 
          // Error: Evaluation failed: TypeError: Cannot read property 'querySelector' of undefined
          let firstParagraph = contentPage.querySelector(
            'div[class*="usertext-body"] > p'
          );

          if (firstParagraph != null) {
            return firstParagraph.innerText.trim();
          } else {
            return null;
          }
        });
      }

      results.push({
        title,
        content,
        image,
        date,
        popularity,
        platform,
      });
    }

    return results;
  },
};

module.exports = self;

但是發生了錯誤： Error: Evaluation failed: TypeError: Cannot read property 'querySelector' of undefined 。

誰能指出我做錯了什么？

謝謝！

Answer 1

page.evaluate基本上在瀏覽器的上下文中執行代碼。 IE：與您放入瀏覽器開發人員控制台以獲得相同結果的相同內容。 因此，在這種情況下，您可能希望使用document.querySelector()而不是對未定義的contentPage的引用：

let firstParagraph = document.querySelector(
  'div[class*="usertext-body"] > p'
);

如何使用 puppeteer 抓取新頁面？

問題描述

1 個解決方案

解決方案1
2 已采納 2020-07-01 22:46:20

如何使用 puppeteer 抓取新頁面？

問題描述

1 個解決方案

解決方案1 2 已采納 2020-07-01 22:46:20

解決方案1
2 已采納 2020-07-01 22:46:20