from bs4 import BeautifulSoup
from urllib import request
import csv
# Send a browser-like User-Agent with the request instead of urllib's default.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
# The URL of the Goodreads list page to be scraped.
company_page = 'https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century?'
# Build the request object so the custom headers travel with it.
page_request = request.Request(company_page, headers=headers)
# Open the page inside a `with` block so the HTTP response is closed as soon
# as parsing is done, rather than being left open for the rest of the script.
with request.urlopen(page_request) as page:
    # Parse the HTML using Beautiful Soup's 'html.parser' backend; the
    # response body is consumed here, while the connection is still open.
    html_content = BeautifulSoup(page, 'html.parser')
# Look up the <div id="shell"> element and print it. Note that this is a
# single element selected by id, not the individual book titles; find()
# yields None when no element matches.
title = html_content.find('div', id='shell')
print(title)
Try this to retrieve all book titles:
from bs4 import BeautifulSoup
import requests
import csv
# Browser-like User-Agent sent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
# Goodreads list page to scrape; requested over HTTPS directly rather than
# starting from plain HTTP.
company_page = 'https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century?'
# requests applies no timeout unless one is given, so set one explicitly to
# keep the script from hanging on an unresponsive server.
page = requests.get(company_page, headers=headers, timeout=30)
# Stop with an HTTPError on a 4xx/5xx answer instead of parsing an error page.
page.raise_for_status()
# Parse the raw response bytes with Beautiful Soup's 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')
# Collect every <a> element whose CSS class is "bookTitle" and print the
# text of each one on its own line.
titles = soup.find_all('a', class_="bookTitle")
for title in titles:
    print(title.text)
Output:
To Kill a Mockingbird
1984
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
The Great Gatsby
Animal Farm
The Hobbit, or There and Back Again
The Diary of a Young Girl
The Little Prince
Fahrenheit 451
The Catcher in the Rye
The Lion, the Witch and the Wardrobe (Chronicles of Narnia, #1)
The Grapes of Wrath
One Hundred Years of Solitude
...
Note: I have switched the HTTP library from urllib to requests.
The technical post webpages of this site are licensed under CC BY-SA 4.0. If you reprint this content, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.