i am trying to scrape a real estate website but I am having issues getting my code to go to the next page(25 pages total). Currently, it is just continuously scraping page 1. I am a pretty big newb at this so apologies if this is a dumb request.
import requests
from bs4 import BeautifulSoup
from csv import writer
base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'
for i in range(1,26):
url = '/page/' + str(i)
while url:
response = requests.get(f"{base_url}{url}")
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip()
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
bedrooms = (listing.find_all("li")[2]).get_text()
bathrooms = (listing.find_all("li")[3]).get_text()
square_feet = (listing.find_all("li")[4]).get_text()
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
next_btn = soup.find(class_="paginator-next_page paginator-control")
url = next_btn.find("a")["href"]
You had two issues with your loop.
The indentation of the find()
statement, made the code find the button multiple times per page, that is unnecessary.
The while loop stops you from progressing from page 1 to 2, since url is true even when you have found the next page. Simply remove that.
Here is a fixed version:
import requests
from bs4 import BeautifulSoup
from csv import writer
base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'
for i in range(1,26):
url = '/page/' + str(i)
response = requests.get(f"{base_url}{url}")
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
#do you csv work here
next_btn = soup.find(class_="paginator-next_page paginator-control")
url = next_btn.find("a")["href"]
print(url)
To develop your code a bit, I then broke out the csv logic intoa function, and used a while loop instead of for loop. The benefit of this, is that you dont need to update the loop if more listings make the pagination longer or shorter.
As I tried my code out, I found that the domain asks that you don't request faster than one page per 5 seconds, so I added a 5 second delay between scrapes
import requests
import time
from bs4 import BeautifulSoup as soup
def parse_listing(page_html):
listings = soup.find_all("article")
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip()
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
bedrooms = (listing.find_all("li")[2]).get_text()
bathrooms = (listing.find_all("li")[3]).get_text()
square_feet = (listing.find_all("li")[4]).get_text()
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
prefix = 'https://www.rew.ca'
d = soup(requests.get('https://www.rew.ca/properties/areas/kelowna-bc').text, 'html.parser')
while True:
parse_listing(d)
next_page=d.find('a', {'rel': 'next'})
if next_page:
href_link=next_page.get('href')
print(href_link)
d= soup(requests.get(prefix + href_link).text, 'html.parser')
time.sleep(5)
else:
print("no more 'next page'")
break
Something like this ought to work. It's not pretty, but hopefully it helps you see how it's rotating through the pages.
import requests
from bs4 import BeautifulSoup
from csv import writer
import time
## use the actual base url since url returned from the page is /properties/areas/kelowna-bc/page/XX
base_url = 'https://www.rew.ca'
url = '/properties/areas/kelowna-bc/page/1'
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
while url:
time.sleep(5) ## not sure how slow to make this but the site will start returning 429 if you scrape too fast.
response = requests.get(f"{base_url}{url}")
print(f"{response}, {response.url}") # debugging -- helps show what page was actually requested.
response.raise_for_status() # this will raise an exception if we don't get a 200 returned.
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip().split() ## you'll need to decide how to handle these
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
# not all listings contain bathroom and squarefootage
parts = listing.find_all("li")
bedrooms = (parts[2]).get_text() if len(parts) >= 3 else None
bathrooms = (parts[3]).get_text() if len(parts) >= 4 else None
square_feet = (parts[4]).get_text() if len(parts) >= 5 else None
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
print(f"{title:<45} {type:<15} {price:<10} bath {bathrooms} Sqft {square_feet}")
next_btn = soup.find(class_="paginator-next_page paginator-control")
next_link = next_btn.find("a")
url = next_link['href'] if next_link else None
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.