簡體   English   中英

網絡實時抓取流聊天 (puppeteer.js)

[英]Web scraping a stream chat in real-time (puppeteer.js)

我想通過網絡抓取實時從流中獲取聊天內容。

試圖在 puppeteer 的 .then() 函數內創建一個 while 循環似乎行不通,在某些實現中甚至會讓整個程序無法運行。

我能夠完成初始抓取,但在所有情況下,程序都會直接結束,而不會執行我實現的 while 循環。

沒有while循環的工作代碼

const puppeteer = require ('puppeteer');

//initiating Puppeteer
// Launch a browser, scrape the chat comments currently rendered on the stream
// page once, print them, and shut the browser down again.
puppeteer
  .launch ()
  .then (async browser => {
    // try/finally guarantees the browser process is closed even when
    // navigation or scraping rejects; otherwise it would be left running.
    try {
      // Open a new page and navigate to the live stream.
      const page = await browser.newPage ();
      await page.goto ('https://www.younow.com/Ken_Nara24');
      await page.waitForSelector ('body');

      // Everything inside evaluate() runs in the page context, so the helper
      // has to be defined inside the callback.
      const getComments = await page.evaluate (() => {
        // Return the element's text, or '' when the element is absent,
        // instead of throwing on a null querySelector result.
        const readText = (root, selector) => {
          const node = root.querySelector (selector);
          return node ? node.innerText : '';
        };

        const scrapeItems = [];
        document.body.querySelectorAll ('.comment').forEach (item => {
          scrapeItems.push ({
            commentAuthor: readText (item, 'div.user-card__header.mini-profile-launcher'),
            commentContent: readText (item, 'div.user-card__body.ng-star-inserted'),
          });
        });

        return {
          "userComments": scrapeItems,
        };
      });

      // Output the scraped data.
      console.log (getComments);
    } finally {
      await browser.close ();
    }
  })
  // Report launch, navigation, scraping or shutdown failures.
  .catch (function (err) {
    console.error (err);
  });

使邏輯循環的所有嘗試都是徒勞的。 我找不到一種方法或過去的問題/例子來清楚地定義如何或是否可以完成這樣的事情。 我自己做了一些嘗試來實現它,但甚至沒有正確編譯。

我在這里遺漏了什么重要的東西嗎? 我只想聽一個網頁,每 3-5 秒重新抓取一次。

如果您仍然需要幫助,可以嘗試這種方式。

const puppeteer = require("puppeteer");
let pageScraping = false; /* guard flag: true while a scrape is in progress, so scraper() skips overlapping calls */

/**
 * Scrape the chat comments currently shown on the stream page, log them, and
 * schedule the next scrape 5 seconds after this one finishes.
 *
 * Each run launches its own headless browser and closes it when done. The
 * module-level `pageScraping` flag makes a call a no-op while another run is
 * still in progress. Errors are logged, never thrown, so one failed scrape
 * does not stop the polling loop.
 *
 * @returns {Promise<void>} Resolves once the run has finished and the next
 *   one has been scheduled (or immediately if a run is already in progress).
 */
const scraper = async () => {
  if (pageScraping === true) return; /* a scrape is already running */
  let browser;
  const pageUrl = 'https://www.younow.com/Ken_Nara24';

  try {
    pageScraping = true;
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    /* wait for chat to be visible */
    await page.waitForSelector('.chat', { visible: true, timeout: 60000 });

    /* the callback runs in the page context, so helpers must live inside it */
    const getComments = await page.evaluate(() => {
      /* text of the first match, or '' when absent, so one incomplete
         comment does not throw and discard the whole scrape */
      const readText = (root, selector) => {
        const node = root.querySelector(selector);
        return node ? node.innerText : '';
      };

      const scrapeComments = [];
      document.querySelectorAll('.comment').forEach(comment => {
        scrapeComments.push({
          'commentAuthor': readText(comment, 'div[class="user-card__header mini-profile-launcher"]'),
          'commentContent': readText(comment, 'div[class="user-card__body ng-star-inserted"]'),
        });
      });

      return { 'userComments': scrapeComments };
    });

    console.log(getComments); /* log comments */
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) { /* only close a browser that was actually launched */
      try {
        await browser.close();
        console.log('closing browser');
      } catch (closeErr) {
        /* a failed close must not prevent the flag reset and reschedule below */
        console.log(closeErr.message);
      }
    }
    pageScraping = false; /* allow the next run */
    /* setTimeout returns a timer handle, not a promise, so there is nothing to await */
    setTimeout(scraper, 5000); /* wait 5 seconds before re-scraping */
  }
};

setTimeout(scraper, 5000); /* start scraping: the first run fires after a 5000 ms delay */

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM