
Adjust Python (Beautiful Soup) code to scrape multiple pages

Thanks so much for your support. I am working in Python with BeautifulSoup:

I simply need to run this code across multiple pages (that is, scrape the same data from pages 1 through 1290). I am a beginner, but I imagine it is not that complicated, since the page number appears as a simple parameter in the URL.

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'

# Opening the connection and grabbing the page
uClient = uReq(my_url)

# offload page content into a variable
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")
cards = page_soup.findAll("div", {"class": "card__content"})
contain_cards = cards[0]

# file creation
filename = "propertyfinder.csv"
f = open(filename, "w")
headers = "title,address,area,bedrooms,bathrooms,price\n"
f.write(headers)

## DATA
for contain_cards in cards:
    # TITLE
    title_container = contain_cards.findAll("h2", {"class": "card__title card__title-link"})
    title = title_container[0].text

    # ADDRESS
    address_container = contain_cards.findAll("span", {"class": "card__location-text"})
    address = address_container[0].text

    # PRICE
    price_container = contain_cards.findAll("span", {"class": "card__price-value"})
    price = (price_container[0].text.strip()).replace("EGP", "")

    # BEDROOMS
    bedrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
    bedrooms = bedrooms_container[0].text.strip()

    # BATHROOMS
    bathrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
    bathrooms = bathrooms_container[0].text.strip()

    # AREA
    area_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})
    area = area_container[0].text

    # CLOSING
    print(title)
    print(address)
    print(area)
    print(bedrooms)
    print(bathrooms)
    print(price)
    f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")

f.close()
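Since the page number is an ordinary query parameter, the per-page URLs can be generated with a plain f-string; a minimal sketch of just that step, assuming the parameter behaves the same on every page:

for page in range(1, 1291):
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'
    # fetch and parse my_url here, exactly as in the single-page code above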

Try something like this:

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

# file creation
num = 1
filename = "propertyfinder.csv"
with open(filename, 'w') as f:
    headers = "title,address,area,bedrooms,bathrooms,price\n"
    f.write(headers)
    while True:
        my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={num}&t=3000'

        uClient = uReq(my_url)

        page_html = uClient.read()
        uClient.close()

        page_soup = soup(page_html, "html.parser")
        cards = page_soup.findAll("div", {"class": "card__content"})
        contain_cards = cards[0]
        try:
            for contain_cards in cards:
                # TITLE
                title_container = contain_cards.findAll("h2", {"class": "card__title card__title-link"})
                title = title_container[0].text

                # ADDRESS
                address_container = contain_cards.findAll("span", {"class": "card__location-text"})
                address = address_container[0].text

                # PRICE
                price_container = contain_cards.findAll("span", {"class": "card__price-value"})
                price = (price_container[0].text.strip()).replace("EGP", "")

                # BEDROOMS
                bedrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
                bedrooms = bedrooms_container[0].text.strip()

                # BATHROOMS
                bathrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
                bathrooms = bathrooms_container[0].text.strip()

                # AREA
                area_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})
                area = area_container[0].text

                # CLOSING
                print(title)
                print(address)
                print(area)
                print(bedrooms)
                print(bathrooms)
                print(price)
                f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
        except:
            pass
        num += 1
        if num > 1290:
            break

Note that I bypass some UnicodeEncodeError issues with the try/except, but this shows you how to run multiple pages in one script.
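If those errors come from writing non-ASCII listing text with the platform's default encoding, opening the file with an explicit encoding may remove the need for the try/except; a minimal sketch of just the file-handling step under that assumption:

filename = "propertyfinder.csv"
# Opening the file with an explicit UTF-8 encoding lets non-ASCII
# characters in titles/addresses be written cleanly, instead of
# raising UnicodeEncodeError under a non-UTF-8 default encoding.
with open(filename, "w", encoding="utf-8", newline="") as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")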

I figured it out; posting it below for everyone's reference:

from bs4 import BeautifulSoup
import requests

def scrape_properties(page):
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'

    # Opening the connection and grabbing the page
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=headers)

    # html parsing
    page_soup = BeautifulSoup(response.text, "html.parser")
    cards = page_soup.find_all("div", {"class": "card__content"})
    contain_cards = cards[0]

    # file creation
    filename = "propertyfinder.csv"
    if page == 1:
        f = open(filename, "w")
        headers = "title,address,area,bedrooms,bathrooms,price,ptype\n"
        f.write(headers)
    else:
        f = open(filename, "a")

    ## DATA
    for contain_cards in cards:
        try:
            # TITLE
            title_container = contain_cards.find_all("h2", {"class": "card__title card__title-link"})
            title = title_container[0].text.strip()

            # ADDRESS
            address_container = contain_cards.find_all("span", {"class": "card__location-text"})
            address = address_container[0].text.strip()

            # PRICE
            price_container = contain_cards.find_all("span", {"class": "card__price-value"})
            price = (price_container[0].text.strip()).replace("EGP", "").strip()

            # BEDROOMS
            bedrooms_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
            bedrooms = bedrooms_container[0].text.strip()

            # BATHROOMS
            bathrooms_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
            bathrooms = bathrooms_container[0].text.strip()

            # AREA
            area_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--area"})
            area = area_container[0].text.strip()

            # PTYPE
            ptype_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--property-type"})
            ptype = ptype_container[0].text.strip()

            # CLOSING
            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)
            print(ptype)
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area.replace(",", "") + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "," + ptype + "\n")
        except:
            pass

    f.close()

for page in range(1, 100):
    scrape_properties(page)
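As a side note, the manual .replace(",", "|") escaping can be avoided by letting Python's csv module handle quoting; a minimal sketch of just the writing step under that approach (the row contents below are placeholder data, not values from the site):

import csv

# csv.writer quotes any field that contains a comma, so the manual
# replacement of commas in titles or addresses becomes unnecessary.
# The row below is placeholder data standing in for scraped values.
row = ["Apartment, sea view", "North Coast", "120 sqm", "2", "2", "2500000", "Apartment"]
with open("propertyfinder.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(row)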

