用 Beautiful Soup 和 Python 抓取到 CSV

Question

嘗試使用 Beautiful Soup 和 Selenium 從房地產網站上的列表中抓取樓層面積（以平方英尺為單位）和地塊面積（以公頃為單位）。

地板尺寸在控制台中打印得很好

但是當寫入 csv 文件時，地板尺寸列下的“平方英尺”信息不會被提取

似乎如果 BS4 在規定的元素之后在 ID 元素中找到“平方英尺”，則返回該元素，並且在寫入 csv 時，所有其他“平方英尺”文本都會在每個其他 url 上傳遞。 正如您在（圖片）上看到的，盡管這兩個鏈接也有公頃，但其中兩個列表都有這個：

http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site- Westvale-Park-Horley-Surrey--RH6-0HJ

有人可以解釋為什么 sq ft 打印在控制台上但沒有寫入 csv 嗎？ 任何幫助，將不勝感激。

相關 HTML，其中 CP2_CPContent_conDetails1_divDetails 是樓層大小和批量大小的相關定位器：

<div id="CP2_CPContent_conDetails1_divDetails">
                0.3 Acres <br>(0.12 Hectares)
                <div class="clear"></div>

                <div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
                      Potential building size of 6,458 sq ft (600 sq m)<br>
                </div>

代碼如下：

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
        lot_size = lot_size.replace("(", "").replace(")", "")
        print(lot_size)
        return lot_size
    except:
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
        print(floor_size)
        return floor_size
    except:
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

Answer 1

我可以用你的代碼獲取Hectares 。

我的sq ft有問題 - 它甚至不顯示它。 都是因為你使用了find()而不是find_all()

 for element in soup.find()

但是find()不會返回包含元素的列表，而是返回單個元素，然后for不會從列表中獲取此元素，但它可能會獲取其子元素，並在錯誤的位置搜索sq ft 。

from selenium import webdriver
import numpy as np
import time
import re
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
            if lot_size:
                lot_size = lot_size.replace("(", "").replace(")", "")
                lot_size = lot_size.strip()
            print('lot_size:', lot_size)
        return lot_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
            if floor_size:
                floor_size = floor_size.strip()
            print('floor_size:', floor_size)
        return floor_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])
        print('-------------------')

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

CSV：

Floor_Size,Lot_Size
,0.21 Hectares
7342 sq ft,
1665 sq ft,
"The existing property extends to approximately 2,290 sq m (24,649 sq ft) GIA and sits within an L-shaped site extending to approximately 0.6 acres (0.25 hectares). Fronting London Road is a four storey commercial building, built as a garage with offices above which is currently occupied by a motor company at ground floor level, and by a church across the upper floors and basement. To the rear of the site fronting Montague Road are a number of single storey industrial buildings, currently occupied by a hand carwash. The remainder of the front forecourt and rear of the site is hard standing, predominantly used as car parking.",0.25 Hectares
4672 to 20302 sq ft,
,0.36 Hectares
,0.08 Hectares
,0.18 Hectares
2325 sq ft,
,0.02 Hectares
5288 sq ft,
0 sq ft,
,0.36 Hectares
,0.18 Hectares
"*  Potential building size of 6,458 sq ft (600 sq m)",0.12 Hectares
1258 to 5385 sq ft,
,0.13 Hectares
3600 sq ft,
,0.24 Hectares
6781 to 6871 sq ft,

用 Beautiful Soup 和 Python 抓取到 CSV

問題描述

1 個解決方案

解決方案1
1 已采納 2020-01-04 01:23:50

用 Beautiful Soup 和 Python 抓取到 CSV

問題描述

1 個解決方案

解決方案1 1 已采納 2020-01-04 01:23:50

解決方案1
1 已采納 2020-01-04 01:23:50