
How to get Python code to loop to the next page properly when scraping a website?

I am trying to scrape a real estate website, but I am having issues getting my code to go to the next page (25 pages total). Currently, it just keeps scraping page 1. I am a pretty big newb at this, so apologies if this is a dumb question.

import requests
from bs4 import BeautifulSoup
from csv import writer

base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'

for i in range(1,26):
    url = '/page/' + str(i)

    while url:
        response = requests.get(f"{base_url}{url}")
        soup = BeautifulSoup(response.text, "html.parser")
        listings = soup.find_all("article")

        with open("property4.csv", "w") as csv_file:
            csv_writer = writer(csv_file)
            csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
        for listing in listings:
            location = listing.find(class_="displaypanel-info").get_text().strip()
            price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
            link = listing.find("a").get('href').strip()
            title = listing.find("a").get('title').strip()
            type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
            bedrooms = (listing.find_all("li")[2]).get_text()
            bathrooms = (listing.find_all("li")[3]).get_text()
            square_feet = (listing.find_all("li")[4]).get_text()
            csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
            next_btn = soup.find(class_="paginator-next_page paginator-control")
            url = next_btn.find("a")["href"]

You had two issues with your loop.

  1. Indentation

The indentation of the find() call made the code look up the next-page button once per listing instead of once per page, which is unnecessary.

  2. The while loop

The while loop stops you from progressing from page 1 to page 2, since url stays truthy even after you have found the next page. Simply remove it.
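
To make that concrete: any non-empty string is truthy in Python, so a loop like while url: can only exit if url is ever set to an empty value.

url = '/page/1'
print(bool(url))   # True  -- any non-empty string is truthy
print(bool(''))    # False -- only an empty string (or None) would end `while url:`
print(bool(None))  # False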

Here is a fixed version:

import requests
from bs4 import BeautifulSoup
from csv import writer

base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'

for i in range(1,26):
    url = '/page/' + str(i)

    response = requests.get(f"{base_url}{url}")
    soup = BeautifulSoup(response.text, "html.parser")
    listings = soup.find_all("article")
    # do your CSV work here
    next_btn = soup.find(class_="paginator-next_page paginator-control")
    if next_btn and next_btn.find("a"):  # guard: the last page has no next link
        url = next_btn.find("a")["href"]
        print(url)

To develop your code a bit, I then broke the CSV logic out into a function and used a while loop instead of a for loop. The benefit of this is that you don't need to update the loop if more listings make the pagination longer or shorter.

As I tried my code out, I found that the site asks that you don't request faster than one page per 5 seconds, so I added a 5-second delay between requests.

import requests
import time
from csv import writer
from bs4 import BeautifulSoup as soup

def parse_listing(page_html, csv_writer):
  # parse one page of results and append each listing as a CSV row
  listings = page_html.find_all("article")
  for listing in listings:
    location = listing.find(class_="displaypanel-info").get_text().strip()
    price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
    link = listing.find("a").get('href').strip()
    title = listing.find("a").get('title').strip()
    type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
    bedrooms = (listing.find_all("li")[2]).get_text()
    bathrooms = (listing.find_all("li")[3]).get_text()
    square_feet = (listing.find_all("li")[4]).get_text()
    csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])

prefix = 'https://www.rew.ca'
d = soup(requests.get('https://www.rew.ca/properties/areas/kelowna-bc').text, 'html.parser')

# open the file once, so every page appends rows instead of overwriting the file
with open("property4.csv", "w") as csv_file:
  csv_writer = writer(csv_file)
  csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
  while True:
    parse_listing(d, csv_writer)
    next_page = d.find('a', {'rel': 'next'})
    if next_page:
      href_link = next_page.get('href')
      print(href_link)
      d = soup(requests.get(prefix + href_link).text, 'html.parser')
      time.sleep(5)
    else:
      print("no more 'next page'")
      break
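
If the site throttles you anyway (it answers HTTP 429 when you request too fast), one option is to retry with an increasing wait. This is only a sketch of that idea; get_with_backoff and the delay values are made up for illustration, not anything the site documents:

import time
import requests

def get_with_backoff(url, max_retries=5, base_delay=5):
    # hypothetical helper: back off exponentially on HTTP 429
    for attempt in range(max_retries):
        response = requests.get(url)
        if response.status_code != 429:
            response.raise_for_status()  # surface any other HTTP error
            return response
        wait = base_delay * 2 ** attempt  # 5s, 10s, 20s, ...
        print(f"got 429, waiting {wait}s before retrying")
        time.sleep(wait)
    raise RuntimeError(f"still rate-limited after {max_retries} attempts: {url}")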

Something like this ought to work. It's not pretty, but hopefully it helps you see how it rotates through the pages.

import requests
from bs4 import BeautifulSoup
from csv import writer
import time

## use the actual base url, since the url returned from the page is /properties/areas/kelowna-bc/page/XX
base_url = 'https://www.rew.ca'
url = '/properties/areas/kelowna-bc/page/1'

with open("property4.csv", "w") as csv_file:
    csv_writer = writer(csv_file)
    csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
    while url:
        time.sleep(5) ## not sure how slow to make this but the site will start returning 429 if you scrape too fast.
        response = requests.get(f"{base_url}{url}")
        print(f"{response}, {response.url}")  # debugging -- helps show what page was actually requested.
        response.raise_for_status() # this will raise an exception if we don't get a 200 returned.

        soup = BeautifulSoup(response.text, "html.parser")
        listings = soup.find_all("article")
        for listing in listings:
            location = listing.find(class_="displaypanel-info").get_text().strip().split() ## you'll need to decide how to handle these
            price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
            link = listing.find("a").get('href').strip()
            title = listing.find("a").get('title').strip()
            type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()

            # not all listings contain bathroom and squarefootage 
            parts = listing.find_all("li")
            bedrooms = (parts[2]).get_text() if len(parts) >= 3 else None
            bathrooms = (parts[3]).get_text() if len(parts) >= 4 else None
            square_feet = (parts[4]).get_text() if len(parts) >= 5 else None
            csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
            print(f"{title:<45} {type:<15} {price:<10} bath {bathrooms} Sqft {square_feet}")
        next_btn = soup.find(class_="paginator-next_page paginator-control")
        next_link = next_btn.find("a") if next_btn else None  # the button may be absent on the last page
        url = next_link['href'] if next_link else None
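
One more robustness note: instead of concatenating base_url and the href by hand, urllib.parse.urljoin from the standard library handles both relative and absolute links. A small sketch of that alternative:

from urllib.parse import urljoin

base_url = 'https://www.rew.ca'

# relative hrefs are resolved against the base...
print(urljoin(base_url, '/properties/areas/kelowna-bc/page/2'))
# https://www.rew.ca/properties/areas/kelowna-bc/page/2

# ...and absolute hrefs pass through unchanged, so the scraper
# keeps working if the paginator ever returns full URLs
print(urljoin(base_url, 'https://www.rew.ca/properties/areas/kelowna-bc/page/3'))
# https://www.rew.ca/properties/areas/kelowna-bc/page/3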
