I am a beginner in Python. I have been trying to scrape movie data from IMDb, but it didn't work out. Then I tried to save the data I scraped as a CSV file, but it only produces a blank sheet.
# Question script: scrape the IMDb chart page, print one line per movie, then
# try to export a CSV.  The block structure (loop body indentation) is
# restored so the script parses; the export steps at the bottom are left
# exactly as the asker wrote them because they are the subject of the question.
from bs4 import BeautifulSoup
import requests
import csv

# Download the chart page and parse the returned HTML.
source = requests.get('http://www.imdb.com/chart/top')
soup = BeautifulSoup(source.text, 'html.parser')

# Every <tr> inside <tbody class="lister-list"> is treated as one movie.
# NOTE(review): assumes the page contains that <tbody>; soup.find() returns
# None when it does not, and the chained call then raises AttributeError.
movies = soup.find('tbody', class_="lister-list").find_all('tr')

for movie in movies:
    # Text of the link inside the title cell.
    name = movie.find('td', class_="titleColumn").a.text
    # Everything before the first '.' in the title cell's text
    # (presumably the cell starts with "<rank>." -- confirm against the page).
    rank = movie.find('td', class_="titleColumn").get_text(strip=True).split('.')[0]
    # Text of the <span> in the title cell with surrounding parentheses removed.
    year = movie.find('td', class_="titleColumn").span.text.strip('()')
    rating = movie.find('td', class_="ratingColumn imdbRating").strong.text
    print(rank, name, year, rating)

# NOTE: the scraped values above are only printed; nothing is collected or
# written to a file.
# NOTE(review): `files` is not a standard-library module -- presumably the
# Colab helper `google.colab.files` was intended.
import files
# NOTE: `data` is never assigned anywhere in this script.
data.to_csv('sample.csv')
files.download("sample.csv")
You're not writing to a CSV file anywhere in your code. This would be the way to go (taking the Colab environment into account):
# Scrape the IMDb chart page and write rank, name, year and rating to
# sample.csv with csv.DictWriter, then offer the file for download in Colab.
import csv

import requests
from bs4 import BeautifulSoup
from google.colab import files

# Download the chart page and parse the returned HTML.
source = requests.get('http://www.imdb.com/chart/top')
# Stop with an explicit HTTP error on a 4xx/5xx response instead of failing
# later with an AttributeError while parsing an unexpected page.
source.raise_for_status()
soup = BeautifulSoup(source.text, 'html.parser')

# Every <tr> inside <tbody class="lister-list"> is treated as one movie.
# NOTE(review): assumes the page contains that <tbody>; soup.find() returns
# None when it does not.
movies = soup.find('tbody', class_="lister-list").find_all('tr')

fieldnames = ['rank', 'name', 'year', 'rating']

# newline='' is what the csv module expects for files it writes; the explicit
# UTF-8 encoding keeps titles with non-ASCII characters from failing on
# platforms whose default encoding cannot represent them.
with open('sample.csv', 'w', newline='', encoding='utf-8') as csvfile:
    wtr = csv.DictWriter(csvfile, fieldnames=fieldnames)
    wtr.writeheader()
    # Only the first 10 rows are exported; iterate over `movies` directly to
    # write every row that was found.
    for movie in movies[0:10]:
        # Look the title cell up once; name, rank and year all come from it.
        title_cell = movie.find('td', class_="titleColumn")
        name = title_cell.a.text
        # Everything before the first '.' in the cell text
        # (presumably the cell starts with "<rank>." -- confirm).
        rank = title_cell.get_text(strip=True).split('.')[0]
        # Text of the <span> with surrounding parentheses removed.
        year = title_cell.span.text.strip('()')
        rating = movie.find('td', class_="ratingColumn imdbRating").strong.text
        print(rank, name, year, rating)
        wtr.writerow({'rank': rank, 'name': name, 'year': year, 'rating': rating})

# Called after the `with` block so the file is closed (and fully written)
# before it is handed to the download helper.
files.download('sample.csv')
Here's an efficient way using pandas, a popular data science library:
# Scrape the IMDb chart page into a pandas DataFrame and save it as sample.csv.
from bs4 import BeautifulSoup
import requests
import pandas as pd  # import pandas here

# Download the chart page and parse the returned HTML.
source = requests.get('http://www.imdb.com/chart/top')
# Stop with an explicit HTTP error on a 4xx/5xx response instead of failing
# later with an AttributeError while parsing an unexpected page.
source.raise_for_status()
soup = BeautifulSoup(source.text, 'html.parser')

# Every <tr> inside <tbody class="lister-list"> is treated as one movie.
# NOTE(review): assumes the page contains that <tbody>; soup.find() returns
# None when it does not.
movies = soup.find('tbody', class_="lister-list").find_all('tr')

# Collect one dict per movie and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so growing a frame row by row is avoided.
rows = []
for movie in movies:
    # Look the title cell up once; name, rank and year all come from it.
    title_cell = movie.find('td', class_="titleColumn")
    name = title_cell.a.text
    # Everything before the first '.' in the cell text
    # (presumably the cell starts with "<rank>." -- confirm).
    rank = title_cell.get_text(strip=True).split('.')[0]
    # Text of the <span> with surrounding parentheses removed.
    year = title_cell.span.text.strip('()')
    rating = movie.find('td', class_="ratingColumn imdbRating").strong.text
    rows.append({
        'name': name,
        'rank': rank,
        'year': year,
        'rating': rating,
    })
    print(rank, name, year, rating)

# Passing `columns` fixes the column order and still yields a header-only
# file when no rows were scraped.
data = pd.DataFrame(rows, columns=['name', 'rank', 'year', 'rating'])  # define a dataframe

# index=False keeps the automatically generated row index out of the file so
# the CSV contains only the four scraped columns.
data.to_csv('sample.csv', index=False)  # save dataframe to csv
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.