I'm trying to scrape the OpenTable site using Beautiful Soup. The code runs successfully, but the result I am getting has a lot of NA columns. Here is the code.
def parse_html(html):
    """Parse one OpenTable listing page into a DataFrame of restaurant rows.

    Fields that cannot be located in the markup are recorded as the
    string 'NA' rather than raising.
    """
    frame, row = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for idx, card in enumerate(soup.find_all('div', class_='rest-row-info')):
        row['name'] = card.find('span', class_='rest-row-name-text').text
        booked = card.find('div', class_='booking')
        row['bookings'] = re.search(r'\d+', booked.text).group() if booked else 'NA'
        stars = card.select('div.all-stars.filled')
        row['rating'] = int(re.search(r'\d+', stars[0].get('style')).group()) if stars else 'NA'
        review_span = card.find('span', class_='star-rating-text--review-text')
        row['reviews'] = int(re.search(r'\d+', review_span.text).group()) if review_span else 'NA'
        # Price is rendered as a run of '$' characters.
        row['price'] = int(card.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        row['cuisine'] = card.find('span', class_='rest-row-meta--cuisine').text
        row['location'] = card.find('span', class_='rest-row-meta--location').text
        frame[idx] = pd.Series(row)
    # Rows were accumulated as columns; transpose to one row per restaurant.
    return frame.T
# Collect every listing page into one DataFrame, then persist to CSV.
restaurants = pd.DataFrame()
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)

while True:
    sleep(1)  # give the page a moment to render before parsing
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
    # Advance to the next page. Without this click the loop re-parses the
    # same (never-empty) page forever; when no 'Next' link exists we are on
    # the last page, so stop. Broad except because the selenium exception
    # class is not imported in this file.
    try:
        driver.find_element_by_link_text('Next').click()
    except Exception:
        break

driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants)
and the results:
name bookings rating reviews price cuisine location
0 IL Carino Restaurant 1 NA NA 3 Upper East Side
1 French Roast Uptown 10 NA NA 3 Upper West Side
2 The Mermaid Inn Uptown 72 NA NA 3 Upper West Side
3 Cafe Du Soleil 101 NA NA 2 Upper West Side
4 The Leopard at des Artistes 24 NA NA 4 Upper West Side
Any recommendation or suggestion is appreciated.
I don't see this element on the page:
rating = resto.select('div.all-stars.filled')
and the code can't find it either — so you get NA for the rating.
But this selector gives me strings like "4.5 stars out of 5":
rating = resto.select('.star-rating .star-rating-score')
#print(rating)
item['rating'] = rating[0]['aria-label'] if rating else 'NA'
I also don't see this element on the page:
resto.find('span', class_='star-rating-text--review-text')
and the code can't find it either — so you get NA for the reviews.
But this selector gives me strings like "Awesome" or "Exceptional":
reviews = resto.select('div.review-rating-text span')
#print(reviews)
item['reviews'] = reviews[0].text if reviews else 'NA'
There are two elements with the class 'rest-row-meta--cuisine', and you take the first one, so you get $$$$.
item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine').text
Instead, you should use find_all to get both elements and then use [-1] to take the last one:
item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
and this gives me
Pizzeria
Italian
Sushi
Steak
Contemporary Italian
Pizzeria
American
Italian
American
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
import re
def parse_html(html):
    """Parse one OpenTable listing page into a DataFrame of restaurant rows.

    Returns a DataFrame with columns: name, bookings, rating, reviews,
    price, cuisine, location. Fields missing from the markup are recorded
    as the string 'NA'.
    """
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text
        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'
        # Current markup exposes the rating via aria-label, e.g. "4.5 stars out of 5".
        rating = resto.select('.star-rating .star-rating-score')
        item['rating'] = rating[0]['aria-label'] if rating else 'NA'
        # The old 'star-rating-text--review-text' span no longer exists;
        # this yields the verbal grade, e.g. "Awesome", "Exceptional".
        # (Removed a dead assignment that was immediately overwritten here.)
        reviews = resto.select('div.review-rating-text span')
        item['reviews'] = reviews[0].text if reviews else 'NA'
        # Price is rendered as a run of '$' characters.
        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        # Two spans share this class (the first holds the price string);
        # the cuisine is the last one.
        item['cuisine'] = resto.find_all('span', class_='rest-row-meta--cuisine')[-1].text
        item['location'] = resto.find('span', class_='rest-row-meta--location').text
        data[i] = pd.Series(item)
    # Rows were accumulated as columns; transpose to one row per restaurant.
    return data.T
# Collect every listing page into one DataFrame, then persist to CSV.
restaurants = pd.DataFrame()
#driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)

while True:
    sleep(1)  # give the page a moment to render before parsing
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    restaurants = pd.concat([restaurants, new_data], ignore_index=True)
    print(len(restaurants))
    # Advance to the next page. Without this click the loop re-parses the
    # same (never-empty) page forever; when no 'Next' link exists we are on
    # the last page, so stop. Broad except because the selenium exception
    # class is not imported in this file.
    try:
        driver.find_element_by_link_text('Next').click()
    except Exception:
        break

# Release the browser once scraping is done.
driver.close()
restaurants.to_csv('results.csv', index=False)
print(restaurants[['rating', 'reviews', 'cuisine']])
rating reviews cuisine
0 4.5 stars out of 5 Awesome Italian
1 4.5 stars out of 5 Awesome French American
2 4.7 stars out of 5 Exceptional Italian
3 4.8 stars out of 5 Exceptional Seafood
4 4.4 stars out of 5 Awesome French
.. ... ... ...
95 4.7 stars out of 5 Exceptional Contemporary Italian
96 4 stars out of 5 Excellent Pizzeria
97 NA NA American
98 4.7 stars out of 5 Exceptional Italian
99 4.4 stars out of 5 Awesome American
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.