[英]Web scraping a stream chat in real-time (puppeteer.js)
我想通過網絡抓取實時從流中獲取聊天內容。
試圖在 puppeeter 的 .then() 函數內創建一個 while 循環似乎並不有效,並且在某些實現中將其全部分解。
我能夠進行初始刮擦,但在所有情況下,程序都會結束並且不想遵循我實現的 while 循環。
沒有while循環的工作代碼
const puppeteer = require ('puppeteer');
//initiating Puppeteer
puppeteer
.launch ()
.then (async browser => {
//opening a new page and navigating to the live stream
const page = await browser.newPage ();
await page.goto ('https://www.younow.com/Ken_Nara24');
await page.waitForSelector ('body');
//manipulating the page's content
let getComments = await page.evaluate (() => {
let comments = document.body.querySelectorAll ('.comment');
let scrapeItems = [];
comments.forEach (item => {
let commentAuthor = item.querySelector ('div.user-card__header.mini-profile-launcher').innerText;
let commentContent = '';
try {
commentContent = item.querySelector ('div.user-card__body.ng-star-inserted').innerText;
} catch (err) {}
scrapeItems.push ({
commentAuthor: commentAuthor,
commentContent: commentContent,
});
});
let items = {
"userComments": scrapeItems,
};
return items;
});
//outputting the scraped data
console.log (getComments);
//closing the browser
await browser.close ();
})
//handling any errors
.catch (function (err) {
console.error (err);
});
使邏輯循環的所有嘗試都是徒勞的。 我找不到一種方法或過去的問題/例子來清楚地定義如何或是否可以完成這樣的事情。 我自己做了一些嘗試來實現它,但什至沒有正確編譯。
我在這里遺漏了什么重要的東西嗎? 我只想聽一個網頁,每 3-5 秒重新抓取一次。
如果您仍然需要幫助,可以嘗試這種方式。
const puppeteer = require("puppeteer");
let pageScraping = false; /* set scraping to false */
const scraper = async () => {
if (pageScraping == true) return; /* check if already scraping page */
let browser, page;
let pageUrl = 'https://www.younow.com/Ken_Nara24';
try {
pageScraping = true; /* set scraping to true */
browser = await puppeteer.launch({ headless: true });
page = await browser.newPage();
await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
/* wait for chat to be visible */
await page.waitForSelector('.chat', { visible: true, timeout: 60000 });
let getComments = await page.evaluate(() => {
let scrapeComments = [];
let comments = document.querySelectorAll('.comment');
comments.forEach(comment => {
let commentContent = '';
let commentAuthor = comment.querySelector('div[class="user-card__header mini-profile-launcher"]').innerText;
commentContent = comment.querySelector('div[class="user-card__body ng-star-inserted"]').innerText;
scrapeComments.push({
'commentAuthor': commentAuthor,
'commentContent': commentContent,
});
});
return { 'userComments': scrapeComments };
});
console.log(await getComments); /* log comments */
} catch (err) {
console.log(err.message);
} finally {
if (browser) { /* check if browser is open befor trying to close */
await browser.close();
console.log('closing browser');
}
pageScraping = false; /* set scraping to false again */
await setTimeout(scraper, 5000); /* wait 5 seconds befor re-scraping */
}
}
setTimeout(scraper, 5000); /* start scraping */
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.