简体   繁体   中英

Puppeteer: looping through an array of URLs to visit each page is not working

I'm trying to have Puppeteer visit multiple urls in an array, but it does not seem to be working. It hangs forever.

Here's the snippet that doesn't seem to work. Note that each URL is built with a template literal.

  // Build the list of result-page URLs to visit.
  // (pageNumberToNumber and page are defined outside this excerpt.)
  let urlList = [];

  // Add one URL per page; only the CurrentPage value after the '#' differs between entries.
  // NOTE(review): i starts at 0, so the generated values run from CurrentPage=0 to
  // CurrentPage=pageNumberToNumber-1 — confirm whether the site numbers pages from 1.

  for (let i = 0; i < pageNumberToNumber; i++) {
    urlList.push(
      `https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${i}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
    );
  }

  // Print the generated URLs and how many there are.
  console.log(urlList);
  console.log(urlList.length);

  // Visit each URL in order, reusing the same page object.
  for (let i = 0; i < urlList.length; i++) {
    const url = urlList[i];
    await page.goto(`${url}`);
    // NOTE(review): the navigation has already happened by the time goto() above resolves,
    // so this call has no navigation event left to detect — this is where the script is
    // reported to hang.
    await page.waitForNavigation({ waitUntil: "networkidle2" });
  }

Here's the full code

// Scrapes the first results page of a realtor.ca map search, reads the total page count
// from the pagination control, then tries to visit one URL per results page.
const puppeteer = require("puppeteer-extra");
// Enable stealth plugin with all evasions
puppeteer.use(require("puppeteer-extra-plugin-stealth")());

(async () => {
  // headless: false launches the browser with a visible window.
  const browser = await puppeteer.launch({
    headless: false,
  });
  const page = await browser.newPage();

  // First results page (CurrentPage=1 in the URL fragment).
  let url =
    "https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=1&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD";

  await page.goto(url);
  // Fixed pause before reading the DOM — presumably 10 s to let the listings render; confirm.
  await page.waitFor(10000);

  // page.evaluate runs the callback inside the page, as if it were typed into the page's developer console
  const result = await page.evaluate(() => {
    let data = []; // Create an empty array that will store our data
    let elements = document.querySelectorAll(".cardCon"); // Select all listing cards

    // NOTE(review): the fields below are located purely by child position inside each card,
    // so any change to the card markup will break these lookups.
    for (var element of elements) {
      // Loop through each listing card
      let price =
        element.children[0].children[1].childNodes[1].children[1].childNodes[1]
          .childNodes[3].innerText; // Select the price
      let address =
        element.children[0].children[1].childNodes[1].children[1].childNodes[1]
          .childNodes[5].innerText; // Select the address
      let details = element.children[0].children[1].childNodes[1].children[1].childNodes[3].innerText.replace(
        /\n/g,
        " "
      ); // Select the details, with newlines replaced by spaces
      let url = element.children[0].childNodes[3].href; // Select the link (href) of the card

      data.push({ price, details, address, url }); // Push an object with the data onto our array
    }

    return data; // Return our data array
  });

  console.log(result);

  // Read the text of the "total pages" element in the bottom pagination control
  let totalPageNumber = await page.$eval(
    "#ListViewPagination_Bottom > div > div > div > span.paginationTotalPagesNum",
    (element) => element.innerText
  );

  console.log(totalPageNumber);

  // Convert total page number from string to number
  // NOTE(review): parseInt is called without a radix argument.
  let pageNumberToNumber = parseInt(totalPageNumber);

  // Build the list of result-page URLs to visit
  let urlList = [];

  // Add one URL per page; only the CurrentPage value after the '#' differs between entries.
  // NOTE(review): i starts at 0, so the generated values run from CurrentPage=0 to
  // CurrentPage=pageNumberToNumber-1, while the first page above uses CurrentPage=1 — confirm.

  for (let i = 0; i < pageNumberToNumber; i++) {
    urlList.push(
      `https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${i}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
    );
  }

  console.log(urlList);
  console.log(urlList.length);

  // Visit each URL in order, reusing the same page object.
  for (let i = 0; i < urlList.length; i++) {
    const url = urlList[i];
    await page.goto(`${url}`);
    // NOTE(review): the navigation has already happened by the time goto() above resolves,
    // so this call has no navigation event left to detect — this is where the script is
    // reported to hang.
    await page.waitForNavigation({ waitUntil: "networkidle2" });
  }
})();

By the time your goto call resolves, the navigation has already finished, so the following await page.waitForNavigation({ waitUntil: "networkidle2" }); never detects a navigation event.

In your case, you can instead pass an options argument to your goto call to choose the waitUntil strategy.

Replace this

// goto() only resolves once the navigation has already taken place...
await page.goto(`${url}`);
// ...so no navigation event is left for this call to detect.
await page.waitForNavigation({ waitUntil: "networkidle2" });

With this

// The waitUntil strategy is passed to goto() itself, so no separate waitForNavigation() call is made.
await page.goto(`${url}`, { waitUntil: "networkidle2" });

Minimal example: Note that I make a new page for each URL to avoid the issues you are having with goto with this particular website.

const puppeteer = require("puppeteer-extra");
// Enable stealth plugin with all evasions
puppeteer.use(require("puppeteer-extra-plugin-stealth")());

/**
 * Minimal example: builds one search URL per results page and opens each one
 * in its own fresh tab, waiting for "networkidle2" as part of goto() itself.
 *
 * The browser and every tab are closed in `finally` blocks so that a failed
 * navigation does not leave a browser process running, and any error is
 * reported through the trailing .catch() instead of an unhandled rejection.
 */
(async function () {
  const browser = await puppeteer.launch({ headless: false });

  try {
    // Hard-coded for this example (inserted by the answerer).
    const pageNumberToNumber = 3;

    // Result pages are numbered from 1, so CurrentPage runs 1..pageNumberToNumber.
    const urlList = [];
    for (let i = 1; i <= pageNumberToNumber; i++) {
      urlList.push(
        `https://www.realtor.ca/map#ZoomLevel=14&Center=43.771556%2C-79.427630&LatitudeMax=43.78395&LongitudeMax=-79.38710&LatitudeMin=43.75916&LongitudeMin=-79.46816&view=list&CurrentPage=${i}&Sort=6-D&PGeoIds=g10_dpz90869&GeoName=Willowdale%2C%20Toronto%2C%20ON&PropertyTypeGroupID=1&PropertySearchTypeId=1&TransactionTypeId=2&PriceMax=1500000&BuildingTypeId=1&Currency=CAD`
      );
    }

    console.log(urlList);
    console.log(urlList.length);

    for (const url of urlList) {
      // Creating new page to load each url.
      const tempPage = await browser.newPage();
      try {
        await tempPage.goto(url, { waitUntil: "networkidle2" });
        // 5s wait to manually confirm page changed.
        // A plain timer is used because page.waitFor() is deprecated and
        // absent from newer Puppeteer releases.
        await new Promise((resolve) => setTimeout(resolve, 5000));
      } finally {
        await tempPage.close();
      }
    }
  } finally {
    await browser.close();
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

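Alternatively, you can wrap both calls in a Promise.all, invoking waitForNavigation first so that it is already listening when goto triggers the navigation, e.g. await Promise.all([page.waitForNavigation({ waitUntil: "networkidle2" }), page.goto(url)]);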

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM