[英]I can't scrape the rest of the articles in a real estate website using python
[英]how to scrape all the pages on a real estate website using pyton?
我需要一些幫助來抓取房地產網站的多個頁面。 我已經編寫了成功抓取第 1 頁的代碼,並嘗試實現代碼來抓取其中的所有 25 頁,但現在卡住了。 任何提示/幫助將不勝感激。
import requests
from bs4 import BeautifulSoup
from csv import writer
base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'
url = '/page/2'
while url:
response = requests.get(f"{base_url}{url}")
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip()
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
bedrooms = (listing.find_all("li")[2]).get_text()
bathrooms = (listing.find_all("li")[3]).get_text()
square_feet = (listing.find_all("li")[4]).get_text()
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
next_btn = soup.find(class_="paginator-next_page paginator-control")
url = next_btn.find("a")["href"] if "href" else None
每次抓取頁面時,您都應該增加頁碼。 嘗試這個:
import requests
from bs4 import BeautifulSoup
from csv import writer
base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'
for i in range(1, 26):
url = '/page/' + str(i)
while url:
response = requests.get(f"{base_url}{url}")
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip()
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
bedrooms = (listing.find_all("li")[2]).get_text()
bathrooms = (listing.find_all("li")[3]).get_text()
square_feet = (listing.find_all("li")[4]).get_text()
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
next_btn = soup.find(class_="paginator-next_page paginator-control")
url = next_btn.find("a")["href"] if "href" else None
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.