
Scraping OpenTable website using python BeautifulSoup

I'm trying to scrape the OpenTable site using Beautiful Soup. The code runs successfully, but the result I am getting has a lot of NA values. Here is the code.

import re
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

def parse_html(html):
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text

        booking = resto.find('div', class_='booking')
    item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'

        rating = resto.select('div.all-stars.filled')
    item['rating'] = int(re.search(r'\d+', rating[0].get('style')).group()) if rating else 'NA'

        reviews = resto.find('span', class_='star-rating-text--review-text')
    item['reviews'] = int(re.search(r'\d+', reviews.text).group()) if reviews else 'NA'

        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        data[i] = pd.Series(item)
    return data.T


restaurants = pd.DataFrame()
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)
while True:
    sleep(1)
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
   # driver.find_element_by_link_text('Next').click()

driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants)

and the results:

name    bookings    rating  reviews price   cuisine location
0   IL Carino Restaurant        1   NA  NA  3           Upper East Side
1   French Roast Uptown         10  NA  NA  3           Upper West Side
2   The Mermaid Inn Uptown      72  NA  NA  3           Upper West Side
3   Cafe Du Soleil              101 NA  NA  2           Upper West Side
4   The Leopard at des Artistes 24  NA  NA  4           Upper West Side 

Any recommendation or suggestion is appreciated.

I don't see this on the page:

rating = resto.select('div.all-stars.filled')

and the code can't find it either, so you get NA for rating.

But this gives me strings like "4.5 stars out of 5":

rating = resto.select('.star-rating .star-rating-score')
#print(rating)
item['rating'] = rating[0]['aria-label'] if rating else 'NA'
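
If you want the rating as a number rather than the whole label, you can parse it out of that string. A minimal sketch, assuming the "4.5 stars out of 5" format shown above:

rating = resto.select('.star-rating .star-rating-score')
label = rating[0]['aria-label'] if rating else ''
# pull the leading number out of e.g. "4.5 stars out of 5"
match = re.search(r'[\d.]+', label)
item['rating'] = float(match.group()) if match else 'NA'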

I also don't see this on the page:

resto.find('span', class_='star-rating-text--review-text')

and the code can't find it either, so you get NA for reviews.

But this gives me strings like "Awesome" and "Exceptional":

reviews = resto.select('div.review-rating-text span')
#print(reviews)
item['reviews'] = reviews[0].text if reviews else 'NA'

There are two elements with class 'rest-row-meta--cuisine', and find returns only the first one, so you get $$$$:

item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text

but you should use find_all to get both and then take the last one with [-1]:

item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
    

and this gives me

Pizzeria
Italian
Sushi
Steak
Contemporary Italian
Pizzeria
American
Italian
American
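
If you prefer CSS selectors, select matches the same elements as find_all here, so this line is equivalent:

item['cuisine'] = resto.select('span.rest-row-meta--cuisine')[-1].text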

from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
import re

def parse_html(html):
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text

        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'

        rating = resto.select('.star-rating .star-rating-score')
        #print(rating)
        item['rating'] = rating[0]['aria-label'] if rating else 'NA'

        # the old span.star-rating-text--review-text is gone; read the review text instead
        reviews = resto.select('div.review-rating-text span')
        #print(reviews)
        item['reviews'] = reviews[0].text if reviews else 'NA'

        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        
        item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
        #print(item['cuisine'])
        
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        data[i] = pd.Series(item)
    return data.T


restaurants = pd.DataFrame()
#driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)

while True:
    sleep(1)
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
   # driver.find_element_by_link_text('Next').click()
    
#driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants[['rating', 'reviews', 'cuisine']])

                rating      reviews               cuisine
0   4.5 stars out of 5      Awesome               Italian
1   4.5 stars out of 5      Awesome       French American
2   4.7 stars out of 5  Exceptional               Italian
3   4.8 stars out of 5  Exceptional               Seafood
4   4.4 stars out of 5      Awesome                French
..                 ...          ...                   ...
95  4.7 stars out of 5  Exceptional  Contemporary Italian
96    4 stars out of 5    Excellent              Pizzeria
97                  NA           NA              American
98  4.7 stars out of 5  Exceptional               Italian
99  4.4 stars out of 5      Awesome              American
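
One more note: with the 'Next' click commented out, parse_html re-reads the same page on every pass, so the while loop never ends on its own. If you re-enable pagination, keep in mind that find_element_by_link_text was removed in Selenium 4. A sketch using the current API (the try/except stop condition is my assumption: break when the last page has no 'Next' link):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

while True:
    sleep(1)
    restaurants = pd.concat([restaurants, parse_html(driver.page_source)], ignore_index=True)
    print(len(restaurants))
    try:
        # advance to the next page of listings
        driver.find_element(By.LINK_TEXT, 'Next').click()
    except NoSuchElementException:
        # no 'Next' link left, so this is the last page
        break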
