简体   繁体   中英

Puppeteer with lazy loading images

So I am trying to pull out information using data scraping from this real estate website ( https://www.zillow.com/vancouver-bc/ ). I am able to get all the information about the listings on the page, but with images (image links/src), after a few of them, the result is some garbage. I researched and found it was because of lazy loading. I tried almost all the available methods answered by others, but none seem to work — this includes scrolling to the bottom, scrolling with delays ( https://www.npmjs.com/package/puppeteer-autoscroll-down ), and zooming out the browser as much as I can to get the images to render. But it still doesn't work. I have been looking everywhere for hours before deciding to post my question and code here for anyone else to figure it out.

let cheerio        = require('cheerio')
let puppeteer      = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
puppeteer.use(pluginStealth())
let userAgent      = require('random-useragent')
const baseURL      = "https://www.zillow.com/vancouver-bc"
let estateData     = []
let urlLinks       = []

let scrollPageToBottom = require('puppeteer-autoscroll-down')


// Entry point: resets the shared result arrays, builds the list of
// result-page URLs to visit, runs the scrape, and returns the data.
let getEstateData = async () => {
    estateData = []
    urlLinks   = []
    const lastPage = 1 // only scraping the first results page for now
    for (let pgNum = 1; pgNum <= lastPage; pgNum++) {
        // Page 1 is the bare base URL; later pages use the "/<n>_p" suffix.
        const suffix = pgNum === 1 ? "/" : "/" + pgNum + "_p"
        urlLinks.push(baseURL + suffix)
    }
    await searchWebsite()
    console.log("search over")
    return estateData
}

// Visits every URL in urlLinks, scrolls each page to force the lazy-loaded
// listing images to render, then parses the listing cards with cheerio and
// appends the extracted records to the shared estateData array.
let searchWebsite = async () => {
    await puppeteer
        .launch({headless : false})
        .then(async function (browser) {
            let page = await browser.newPage();
            await page.setUserAgent(userAgent.getRandom())
            for(let url of urlLinks){
                console.log(url)
                await page.goto(url).then(async function () {
                    // FIX: scroll to the bottom BEFORE snapshotting the HTML.
                    // The original code called page.content() first, so the
                    // lazy-loaded images past the first few cards still had
                    // placeholder src values when cheerio parsed them.
                    const scrollStep = 250 // default
                    const scrollDelay = 100 // default
                    await scrollPageToBottom(page, scrollStep, scrollDelay)
                    // Give the last batch of images a moment to finish loading.
                    // NOTE(review): page.waitFor is deprecated in newer
                    // puppeteer releases; use page.waitForTimeout on upgrade.
                    await page.waitFor(2000)

                    // Snapshot the fully rendered DOM and parse it.
                    const html = await page.content();
                    let obj = cheerio('.list-card-link.list-card-info', html)
                    let imgObj = cheerio(".list-card-top", html)
                    let geoLocation = cheerio(".photo-cards.photo-cards_wow", html)

                    let num = 0
                    console.log(obj.length)
                    for (let key in obj) {
                        // cheerio results also carry non-element keys (length,
                        // options, ...); only entries with attribs are real
                        // anchor elements.
                        if (obj[key].attribs) {
                            try {
                                // Geo coordinates live in an embedded JSON blob
                                // inside the card-list container.
                                let geoStr = geoLocation[0].children[0].children[0].children[0].data
                                let geoObj = JSON.parse(geoStr)["geo"]

                                let extractedInfo = {
                                    estateName : obj[key].children[0].children[0].data,
                                    estatePrice : obj[key].children[2].children[0].children[0].data,
                                    saleType : obj[key].children[1].children[0].next.data,
                                    estateConfig : {
                                        beds :  obj[key].children[2].children[1].children[0].children[0].data,
                                        bath :  obj[key].children[2].children[1].children[1].children[0].data,
                                        area :  obj[key].children[2].children[1].children[2].children[0].data
                                    },
                                    estateLocation : {
                                        longitude : geoObj.longitude,
                                        latitude : geoObj.latitude
                                    },
                                    estateLink : obj[key].attribs.href,
                                    estateCoverImgLink : imgObj[num++].children[2].children[0].attribs.src
                                }
                                // FIX: the original logged imgObj[num] AFTER the
                                // num++ above, printing the NEXT card's image.
                                console.log(extractedInfo.estateName, extractedInfo.estateCoverImgLink)
                                estateData.push(extractedInfo)
                            }
                            catch (e) {
                                // A card with an unexpected DOM shape is skipped,
                                // not fatal; log enough to identify it.
                                console.log("Estate Skipped - ", obj[key].children[0].children[0].data, obj[key].attribs.href)
                                console.log(e)
                            }
                        }
                    }
                    console.log(estateData.length)
                });
            }
            console.log("total - ", estateData.length)
            await page.close()
            await browser.close()
        })
        .catch(function (err) {
            console.log(err)
        });
}

// Expose the scraper entry point so callers can `require(...).getEstateData()`.
module.exports.getEstateData = getEstateData

I had a similar issue and found a working answer here. Hopefully it works for you too. The interval was a little slow, so I changed it from 100 to 30.

I was able to solve this with a pretty simple implementation using the puppeteer-autoscroll-down library as you mentioned. I'm not sure which images you were specifically attempting to grab, but this worked for me.

// NOTE(review): fragment assumes `page` (a puppeteer Page) and the
// puppeteer-autoscroll-down `scrollPageToBottom` helper are already in scope.

// Set the initial viewport and navigate to the page
await page.setViewport({ width: 1300, height: 1000 });
await page.goto('https://www.zillow.com/vancouver-bc/', { waitUntil: 'load' });

// Scroll to the very top of the page
await page.evaluate(_ => {
      window.scrollTo(0, 0);
});

// Scroll to the bottom of the page with puppeteer-autoscroll-down
// so every lazy-loaded image is fetched before we read the DOM
await scrollPageToBottom(page);

// Get your image links
// $$eval runs in browser context and returns the resolved src of each card image
let imageLinks = await page.$$eval('.list-card img', imgLinks => {
    return imgLinks.map((i) => i.src);
});

imageLinks was an array with 40 fully formed links, https://photos.zillowstatic.com/p_e/ISz7wlfm278p501000000000.jpg is one example.

Hope that helps you, this was a pretty brutal one for me to solve as well.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM