![](/img/trans.png)
[英]How to go to next page until the last page in Python Selenium when scraping website?
[英]how to get python code to loop to next page properly when scraping a website?
我正在尝试抓取一个房地产网站,但在将我的代码获取到 go 到下一页(共 25 页)时遇到问题。 目前,它只是不断地抓取第 1 页。我是一个相当大的新手,如果这是一个愚蠢的请求,我深表歉意。
import requests
from bs4 import BeautifulSoup
from csv import writer
base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'
for i in range(1,26):
url = '/page/' + str(i)
while url:
response = requests.get(f"{base_url}{url}")
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip()
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
bedrooms = (listing.find_all("li")[2]).get_text()
bathrooms = (listing.find_all("li")[3]).get_text()
square_feet = (listing.find_all("li")[4]).get_text()
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
next_btn = soup.find(class_="paginator-next_page paginator-control")
url = next_btn.find("a")["href"]
你的循环有两个问题。
find()
语句的缩进,使代码每页多次找到按钮,这是不必要的。
while 循环阻止您从第 1 页前进到第 2 页,因为即使您找到了下一页,url 也是如此。 只需删除它。
这是一个固定版本:
import requests
from bs4 import BeautifulSoup
from csv import writer
base_url = 'https://www.rew.ca/properties/areas/kelowna-bc'
for i in range(1,26):
url = '/page/' + str(i)
response = requests.get(f"{base_url}{url}")
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
#do you csv work here
next_btn = soup.find(class_="paginator-next_page paginator-control")
url = next_btn.find("a")["href"]
print(url)
为了稍微开发您的代码,然后我将 csv 逻辑分解为 function,并使用 while 循环而不是 for 循环。 这样做的好处是,如果更多列表使分页更长或更短,您不需要更新循环。
当我尝试我的代码时,我发现域要求您每 5 秒请求一页的速度不超过一页,因此我在两次抓取之间添加了 5 秒延迟
import requests
import time
from bs4 import BeautifulSoup as soup
def parse_listing(page_html):
listings = soup.find_all("article")
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip()
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
bedrooms = (listing.find_all("li")[2]).get_text()
bathrooms = (listing.find_all("li")[3]).get_text()
square_feet = (listing.find_all("li")[4]).get_text()
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
prefix = 'https://www.rew.ca'
d = soup(requests.get('https://www.rew.ca/properties/areas/kelowna-bc').text, 'html.parser')
while True:
parse_listing(d)
next_page=d.find('a', {'rel': 'next'})
if next_page:
href_link=next_page.get('href')
print(href_link)
d= soup(requests.get(prefix + href_link).text, 'html.parser')
time.sleep(5)
else:
print("no more 'next page'")
break
像这样的东西应该可以工作。 它并不漂亮,但希望它可以帮助您了解它是如何在页面中旋转的。
import requests
from bs4 import BeautifulSoup
from csv import writer
import time
## use the actual base url since url returned from the page is /properties/areas/kelowna-bc/page/XX
base_url = 'https://www.rew.ca'
url = '/properties/areas/kelowna-bc/page/1'
with open("property4.csv", "w") as csv_file:
csv_writer = writer(csv_file)
csv_writer.writerow(["title", "type", "price", "location", "bedrooms", "bathrooms", "square feet", "link"])
while url:
time.sleep(5) ## not sure how slow to make this but the site will start returning 429 if you scrape too fast.
response = requests.get(f"{base_url}{url}")
print(f"{response}, {response.url}") # debugging -- helps show what page was actually requested.
response.raise_for_status() # this will raise an exception if we don't get a 200 returned.
soup = BeautifulSoup(response.text, "html.parser")
listings = soup.find_all("article")
for listing in listings:
location = listing.find(class_="displaypanel-info").get_text().strip().split() ## you'll need to decide how to handle these
price = listing.find(class_="displaypanel-title hidden-xs").get_text().strip()
link = listing.find("a").get('href').strip()
title = listing.find("a").get('title').strip()
type = (listing.find(class_="clearfix hidden-xs").find(class_="displaypanel-info")).get_text()
# not all listings contain bathroom and squarefootage
parts = listing.find_all("li")
bedrooms = (parts[2]).get_text() if len(parts) >= 3 else None
bathrooms = (parts[3]).get_text() if len(parts) >= 4 else None
square_feet = (parts[4]).get_text() if len(parts) >= 5 else None
csv_writer.writerow([title, type, price, location, bedrooms, bathrooms, square_feet, link])
print(f"{title:<45} {type:<15} {price:<10} bath {bathrooms} Sqft {square_feet}")
next_btn = soup.find(class_="paginator-next_page paginator-control")
next_link = next_btn.find("a")
url = next_link['href'] if next_link else None
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.