I am trying to scrape the content of a page using the Apify SDK, and the following code works nicely. But how can I force headless mode with the Apify SDK, the way puppeteer.launch({headless: true}) does?
Code for your reference:
/**
 * Crawls pages under a fixed start URL with an Apify PuppeteerCrawler and
 * records the URL, title and HTML length of every page that was handled.
 *
 * @param {string|number} number - Run identifier; appended to the local storage
 *   directory path and passed to Apify.openRequestQueue.
 * @returns {Promise<{links: string[], title: string[], content: number[]}>}
 *   Parallel arrays: entry i of each array describes the same page.
 */
async function scrape(number) {
    const output = { links: [], title: [], content: [] };
    const startUrl = 'https://somepage/';

    // Point the SDK's local storage at a per-run directory.
    process.env.APIFY_LOCAL_STORAGE_DIR = `/someappfolder/apify_storage/run_${number}`;

    const requestQueue = await Apify.openRequestQueue(number);
    await requestQueue.addRequest({ url: startUrl });
    const pseudoUrls = [new Apify.PseudoUrl(`${startUrl}[.*]`)];

    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        handlePageFunction: async ({ request, page }) => {
            // Ask the page for each value once, before touching shared state.
            const title = await page.title();
            const contentLength = (await page.content()).length;

            // Handlers run concurrently; pushing all three values with no await
            // in between keeps the arrays index-aligned.
            output.links.push(request.url);
            output.title.push(title);
            output.content.push(contentLength);

            console.log(`URL: ${request.url}`);
            await Apify.utils.enqueueLinks({
                page,
                selector: 'a',
                pseudoUrls,
                requestQueue,
            });
        },
        maxRequestsPerCrawl: 10,
        maxConcurrency: 10,
        minConcurrency: 2,
    });

    await crawler.run();
    return output;
}
添加 launchPuppeteerOptions: { headless: true },与 requestQueue 放在同一级别。
参见 https://sdk.apify.com/docs/typedefs/launch-puppeteer-options#docsNav
// Set the APIFY_HEADLESS environment variable before the crawler is created.
// process.env stores every value as a string, so the flag is written as one.
process.env.APIFY_HEADLESS = '1';
经过数小时的搜索,我偶然发现了答案... https://sdk.apify.com/docs/guides/environment-variables#apify_headless
Neither of the in-code answers here worked for me. After some searching, I found that the following seems to work.
const Apify = require('apify');
// Entry point: crawl from `baseurl` with a headless Puppeteer browser and log
// the title of every page handled.
Apify.main(async () => {
    const baseurl = 'https://thedomain.youwanna.check.com/somepage';

    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest({ url: baseurl });

    const options = {
        requestQueue,
        // Browser launch settings are nested under launchContext.launchOptions.
        launchContext: {
            launchOptions: {
                headless: true,
                // Puppeteer's option is spelled `slowMo` (delay in milliseconds);
                // a misspelled key would be silently ignored.
                slowMo: 1000,
            },
        },
        handlePageFunction: async ({ request, page }) => {
            const title = await page.title();
            console.log(`Title of ${request.url}: ${title}`);

            // Enqueue links from this page that match the pseudo-URL `<baseurl>[.*]`.
            await Apify.utils.enqueueLinks({
                requestQueue,
                page,
                pseudoUrls: [`${baseurl}[.*]`],
            });
        },
    };

    const crawler = new Apify.PuppeteerCrawler(options);
    await crawler.run();
});
You can add the headless option to the launchPuppeteerOptions
like this:
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
launchPuppeteerOptions: {
headless: true,
ignoreHTTPSErrors: true,
// slowMo: 500,
},
maxRequestsPerCrawl: settings.maxurls,
maxConcurrency: settings.maxcrawlers,
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.