简体   繁体   中英

Scraping with Beautiful Soup and Python to CSV

Trying to scrape floor sizes (in sq ft) and lot sizes (in hectares) from listings on a real estate website using Beautiful Soup and Selenium.

The floor sizes print fine in the console

图片

but when writing to a csv file the the 'sq ft' info under the floor size column is not extracted

图片

It seems if 'sq ft' is found by BS4 in the ID element after the the one stipulated, that is returned instead and all other 'sq ft' text is passed over on every other url when writing to the csv. As you can see on ( image ) two of the listings have this, despite those two links having hectares as well:

http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site-Westvale-Park-Horley-Surrey--RH6-0HJ

Can someone explain why the sq ft are printed on the console but not written to the csv? Any help would be appreciated.

Relevant HTML where CP2_CPContent_conDetails1_divDetails is relevant locator for floor sizes and lot sizes:

<div id="CP2_CPContent_conDetails1_divDetails">
                0.3 Acres <br>(0.12 Hectares)
                <div class="clear"></div>

                <div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
                      Potential building size of 6,458 sq ft (600 sq m)<br>
                </div>

Code as follows:

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
        lot_size = lot_size.replace("(", "").replace(")", "")
        print(lot_size)
        return lot_size
    except:
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
        print(floor_size)
        return floor_size
    except:
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

I have no problem to get Hectares with your code.

I had problem with sq ft - it doesn't even display it. All because you used find() instead of find_all() in

 for element in soup.find()

but find() doesn't return list with elements but single element and then for doesn't get this element from list but it gets probably its children and it searchs sq ft in wrong places.


from selenium import webdriver
import numpy as np
import time
import re
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)


#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.select('img[src*="propNext"]')
        if next_button:
            next_button = next_button[0].find_parent('a')
            next_button_link = 'http://property.shw.co.uk' + next_button['href']
            driver.get(next_button_link)
    return house_links

#get html data from url and return as object
def get_html_data(url, driver):
    driver.get(url)
    time.sleep(np.random.lognormal(0,1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

def get_lot_size(soup):
    try:
        for element in soup.find_all('div', {'id':'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
            if lot_size:
                lot_size = lot_size.replace("(", "").replace(")", "")
                lot_size = lot_size.strip()
            print('lot_size:', lot_size)
        return lot_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def get_floor_size(soup):
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
            if floor_size:
                floor_size = floor_size.strip()
            print('floor_size:', floor_size)
        return floor_size
    except Exception as ex:
        print("EX:", ex)
        return 'NA'

def flatten_list(house_links):
    house_links_flat = []
    for sublist in house_links:
        for item in sublist:
            house_links_flat.append(item)
    return house_links_flat

def get_house_data(driver, house_links_flat):
    house_data = []
    for link in house_links_flat:
        soup = get_html_data(link, driver)
        floor_size = get_floor_size(soup)
        lot_size = get_lot_size(soup)
        house_data.append([floor_size, lot_size])
        print('-------------------')

    return house_data

house_links_3pages = get_house_links(shw_search_url,driver,pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver,house_links_flat)


#open and write results to csv
file_name = "SHW %s_%s.csv" % (str(time.strftime("%Y-%m-%d")),
                           str(time.strftime("%H:%M%S")))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

CSV:

Floor_Size,Lot_Size
,0.21 Hectares
7342 sq ft,
1665 sq ft,
"The existing property extends to approximately 2,290 sq m (24,649 sq ft) GIA and sits within an L-shaped site extending to approximately 0.6 acres (0.25 hectares). Fronting London Road is a four storey commercial building, built as a garage with offices above which is currently occupied by a motor company at ground floor level, and by a church across the upper floors and basement. To the rear of the site fronting Montague Road are a number of single storey industrial buildings, currently occupied by a hand carwash. The remainder of the front forecourt and rear of the site is hard standing, predominantly used as car parking.",0.25 Hectares
4672 to 20302 sq ft,
,0.36 Hectares
,0.08 Hectares
,0.18 Hectares
2325 sq ft,
,0.02 Hectares
5288 sq ft,
0 sq ft,
,0.36 Hectares
,0.18 Hectares
"*  Potential building size of 6,458 sq ft (600 sq m)",0.12 Hectares
1258 to 5385 sq ft,
,0.13 Hectares
3600 sq ft,
,0.24 Hectares
6781 to 6871 sq ft,

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM