[英]Adjust python (beautiful soup) code to scrape multiple pages
非常感谢您的支持,我正在使用 Python BeautifulSoup:
我需要简单地在多个页面上运行此代码(也就是在第 1 到 1290 页上抓取相同的数据)。 我是新手,我可以想象它并没有那么复杂,因为 URL 的页码非常简单
"""Scrape one propertyfinder.eg search-results page and write the cards to a CSV."""
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'

# Opening the connection and grabbing the page
uClient = uReq(my_url)
# offload page content into a variable
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")
cards = page_soup.findAll("div", {"class": "card__content"})

# File creation. "with" closes the file even if a card raises mid-loop, and
# encoding="utf-8" prevents UnicodeEncodeError on non-ASCII listing text.
filename = "propertyfinder.csv"
with open(filename, "w", encoding="utf-8") as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")
    # DATA: one CSV row per property card
    for contain_cards in cards:
        # TITLE
        title = contain_cards.findAll("h2", {"class": "card__title card__title-link"})[0].text
        # ADDRESS
        address = contain_cards.findAll("span", {"class": "card__location-text"})[0].text
        # PRICE (strip the currency label)
        price = contain_cards.findAll("span", {"class": "card__price-value"})[0].text.strip().replace("EGP", "")
        # BEDROOMS
        bedrooms = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})[0].text.strip()
        # BATHROOMS
        bathrooms = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})[0].text.strip()
        # AREA
        area = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})[0].text
        # CLOSING: echo progress, then write the row. Commas inside free text
        # would break the CSV columns, hence the replace() calls.
        print(title)
        print(address)
        print(area)
        print(bedrooms)
        print(bathrooms)
        print(price)
        f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
尝试这样的事情:
"""Scrape propertyfinder.eg search-results pages 1..1290 into one CSV file."""
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

# file creation — encoding="utf-8" fixes the UnicodeEncodeError that the
# original bare try/except was silently hiding, and "with" guarantees the
# file is closed even if a page fails mid-scrape.
filename = "propertyfinder.csv"
with open(filename, 'w', encoding="utf-8") as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")
    for num in range(1, 1291):  # pages 1..1290 inclusive
        my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={num}&t=3000'
        uClient = uReq(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        cards = page_soup.findAll("div", {"class": "card__content"})
        for card in cards:
            try:
                # TITLE
                title = card.findAll("h2", {"class": "card__title card__title-link"})[0].text
                # ADDRESS
                address = card.findAll("span", {"class": "card__location-text"})[0].text
                # PRICE (strip currency label)
                price = card.findAll("span", {"class": "card__price-value"})[0].text.strip().replace("EGP", "")
                # BEDROOMS
                bedrooms = card.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})[0].text.strip()
                # BATHROOMS
                bathrooms = card.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})[0].text.strip()
                # AREA
                area = card.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})[0].text
            except IndexError:
                # Card is missing one of the expected fields -> skip just this
                # card; the original bare "except: pass" dropped entire pages.
                continue
            # CLOSING: echo progress, then write one CSV row. Commas inside
            # free text would break the columns, hence the replace() calls.
            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
请注意,我通过 try/except 绕过了一些 UnicodeEncodeError,但上面已经演示了如何在一个脚本中抓取多个页面。
想出来如下供大家参考:
"""Final version: scrape propertyfinder.eg search pages with requests + BeautifulSoup."""
from bs4 import BeautifulSoup
import requests


def scrape_properties(page):
    """Fetch one search-results page and write its property cards to propertyfinder.csv.

    page -- 1-based page number. Page 1 (re)creates the CSV and writes the
    header row; subsequent pages append.
    """
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'
    # Opening the connection and grabbing the page. Browser-like headers:
    # presumably the site rejects the default python user agent — verify.
    request_headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=request_headers)

    # html parsing
    page_soup = BeautifulSoup(response.text, "html.parser")
    cards = page_soup.find_all("div", {"class": "card__content"})

    # File creation: write mode with header on page 1, append afterwards.
    # "with" closes the file even when a card raises (the original leaked the
    # handle on error), and utf-8 prevents UnicodeEncodeError on listing text.
    filename = "propertyfinder.csv"
    mode = "w" if page == 1 else "a"
    with open(filename, mode, encoding="utf-8") as f:
        if page == 1:
            f.write("title,address,area,bedrooms,bathrooms,price,ptype\n")
        # DATA: one CSV row per property card
        for card in cards:
            try:
                # TITLE
                title = card.find_all("h2", {"class": "card__title card__title-link"})[0].text.strip()
                # ADDRESS
                address = card.find_all("span", {"class": "card__location-text"})[0].text.strip()
                # PRICE (strip the currency label)
                price = card.find_all("span", {"class": "card__price-value"})[0].text.strip().replace("EGP", "").strip()
                # BEDROOMS
                bedrooms = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})[0].text.strip()
                # BATHROOMS
                bathrooms = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})[0].text.strip()
                # AREA
                area = card.find_all("p", {"class": "card__property-amenity card__property-amenity--area"})[0].text.strip()
                # PTYPE (property type)
                ptype = card.find_all("p", {"class": "card__property-amenity card__property-amenity--property-type"})[0].text.strip()
            except IndexError:
                # Card lacks one of the expected fields: skip this card only,
                # instead of the bare "except: pass" that hid every error.
                continue
            # CLOSING: echo progress, then write the row. Commas inside free
            # text would break the CSV columns, hence the replace() calls.
            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)
            print(ptype)
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area.replace(",", "") + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "," + ptype + "\n")


for page in range(1, 100):
    scrape_properties(page)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.