[英]Avoiding captcha with Puppeteer and Node.js
I'm trying to scrape some content from a website, and I think the captcha is blocking the task.我正在尝试从网站上抓取一些内容,我认为验证码 (captcha) 正在阻止这项任务。 I'm setting a user agent, but it still does not work.
我正在使用 userAgent 但它仍然无法正常工作。 Here is the code:
这是代码:
// helper.js
const puppeteer = require('puppeteer');
const userAgent = require('user-agents');
/**
 * Opens a page in a Puppeteer-controlled browser and returns its rendered HTML.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<string|null>} Outer HTML of the first element matching `*`
 *   once the network is idle, or `null` when launching, navigating or
 *   evaluating fails (the error is logged, not rethrown).
 */
async function getDynamicPageHtml(url) {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    // `user-agents` exports a class; it has to be instantiated to obtain a
    // user-agent string. Calling toString() on the class itself yields the
    // class source text instead.
    await page.setUserAgent(new userAgent().toString());
    await page.goto(url, { waitUntil: 'networkidle0' });
    return await page.evaluate(() => document.querySelector('*').outerHTML);
  } catch (err) {
    console.error(err);
    return null;
  } finally {
    // Release the browser on every path, including failed navigation.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.error(closeErr);
      }
    }
  }
}
module.exports = {
getDynamicPageHtml
}
// idealista.js
const cheerio = require('cheerio');
const browser = require('./helper');
/**
 * Downloads the idealista listing page through the helper module and extracts
 * the text of every `#main-content > section > article` element.
 *
 * @returns {Promise<string[]>} Trimmed text of each matched article; an empty
 *   array when the helper could not retrieve the page.
 */
async function getData() {
  const html = await browser.getDynamicPageHtml('https://www.idealista.com/alquiler-habitacion/madrid/chamberi/con-precio-hasta_450,compartido-2-personas/?ordenado-por=fecha-publicacion-desc&ordenado-por=fecha-publicacion-desc');
  console.log(html);

  // The helper resolves to null when the page could not be loaded; there is
  // no document to parse in that case, so bail out before calling cheerio.
  if (html == null) {
    return [];
  }

  const $ = cheerio.load(html);
  const announce = $('#main-content > section > article')
    .map((index, element) => $(element).text().trim())
    .toArray();

  announce.forEach((element, index) => {
    // do stuff
  });

  return announce;
}
module.exports = {
getData
};
// app.js
const express = require('express');
const idealista = require('./idealista');
// Express application: configures the port and body parsers, then starts
// listening and runs the idealista scrape once the server is up.
const app = express();

app.set('port', process.env.PORT || 3000);
app.use(express.json());
app.use(express.urlencoded({ extended: true }));

app.listen(app.get('port'), async () => {
  console.log('server on port ', app.get('port'));
  try {
    await idealista.getData();
  } catch (err) {
    // Nothing awaits this callback, so a rejection here would be unhandled;
    // log it instead of letting it escape.
    console.error(err);
  }
});
This is the output when I debug the Puppeteer page content:这是我调试 puppeteer 页面内容时的输出:
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.