I am trying to scrape a table from multiple pages, one page per week. However, I keep getting only the results from this URL: https://www.boxofficemojo.com/weekly/2018W52/. Here's the code I am using:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
import re
# Years to scrape. np.arange excludes the stop value, so this is just 2015.
pages = np.arange(2015, 2016)

# Week numbers that already have two digits and need no zero padding.
week1 = np.arange(10, 11)

# Week labels exactly as they appear in the URL ("01" ... "10").
# Every entry is a string, so the list is homogeneous instead of mixing
# str values with numpy integers.
week = [f"{n:02d}" for n in range(1, 10)] + [str(n) for n in week1]
# Characters stripped from every cell: non-breaking spaces, newlines and
# thousands separators. Compiled once instead of once per cell.
_CELL_NOISE = re.compile("(\xa0)|(\n)|,")

# One list of cleaned cell strings per table row, accumulated over all pages.
all_rows = []

for page in pages:
    for x in week:
        response = requests.get(
            'https://www.boxofficemojo.com/weekly/' + str(page) + 'W' + str(x) + '/',
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Random 2-10 second pause between requests to go easy on the server.
        sleep(randint(2, 10))

        # The page must be parsed *inside* this loop. `soup` only exists
        # after a page has been fetched, and parsing after the loop would
        # only ever see the last page that was downloaded.
        soup = BeautifulSoup(response.text, 'lxml')
        mov = soup.find_all("table", attrs={"class": "a-bordered"})
        print("Number of tables on site: ", len(mov))
        if not mov:
            # No matching table on this page: skip it instead of failing
            # with an IndexError on mov[0].
            print('Page', page, x, 'has no table, skipped')
            continue

        # The first <tr> is the header row; the remaining rows hold the data.
        for table_row in mov[0].find_all("tr")[1:]:
            all_rows.append(
                [_CELL_NOISE.sub("", cell.text) for cell in table_row.find_all("td")]
            )
        print('Page', page, x)
Assuming you want 52 weeks from each year, why not generate the links in advance and then use pandas to retrieve each table? You can collect the resulting dataframes in a list and concatenate them into a final dataframe:
import pandas as pd
def get_table(url):
    """Fetch the weekly table at *url* and tag it with its year and week.

    Args:
        url: A weekly chart URL containing a ``/<year>W<week>`` segment with
            a four-digit year and a two-digit week, e.g.
            ``https://www.boxofficemojo.com/weekly/2015W01``.

    Returns:
        The first table ``pandas.read_html`` finds on the page, as a
        DataFrame, with two extra columns: ``year`` and ``week_yr``.

    Raises:
        ValueError: If *url* contains no ``/<year>W<week>`` segment.
    """
    import re  # local import keeps this snippet self-contained

    # Locate the year/week by pattern rather than by fixed character
    # offsets, which only line up for one exact base URL.
    match = re.search(r'/(\d{4})W(\d{2})', url)
    if match is None:
        raise ValueError(f'no <year>W<week> segment found in URL: {url!r}')
    year = int(match.group(1))
    week_yr = int(match.group(2))

    df = pd.read_html(url)[0]
    df['year'] = year
    df['week_yr'] = week_yr
    return df
years = ['2015', '2016']
# Zero-padded week labels "01" .. "52", as used in the URL.
weeks = [str(i).zfill(2) for i in range(1, 53)]
base = 'https://www.boxofficemojo.com/weekly'
# One URL per (week, year) pair, e.g. .../weekly/2015W01
urls = [f'{base}/{year}W{week}' for week in weeks for year in years]
# get_table accepts only the URL and derives year and week from it, so no
# second argument is passed (passing one raises a TypeError).
results = pd.concat([get_table(url) for url in urls])
You might then look at ways of speeding things up, e.g. by fetching the pages in parallel:
from multiprocessing import Pool, cpu_count
import pandas as pd
def get_table(url):
    """Fetch the weekly table at *url* and tag it with its year and week.

    Defined at module level so that it can be handed to ``Pool.map``.

    Args:
        url: A weekly chart URL containing a ``/<year>W<week>`` segment with
            a four-digit year and a two-digit week, e.g.
            ``https://www.boxofficemojo.com/weekly/2015W01``.

    Returns:
        The first table ``pandas.read_html`` finds on the page, as a
        DataFrame, with two extra columns: ``year`` and ``week_yr``.

    Raises:
        ValueError: If *url* contains no ``/<year>W<week>`` segment.
    """
    import re  # local import keeps this snippet self-contained

    # Locate the year/week by pattern rather than by fixed character
    # offsets, which only line up for one exact base URL.
    match = re.search(r'/(\d{4})W(\d{2})', url)
    if match is None:
        raise ValueError(f'no <year>W<week> segment found in URL: {url!r}')
    year = int(match.group(1))
    week_yr = int(match.group(2))

    df = pd.read_html(url)[0]
    df['year'] = year
    df['week_yr'] = week_yr
    return df
# The guard keeps worker processes from re-running this section when they
# import the module.
if __name__ == '__main__':
    years = ['2015', '2016']
    # Zero-padded week labels "01" .. "52", as used in the URL.
    weeks = [str(i).zfill(2) for i in range(1, 53)]
    base = 'https://www.boxofficemojo.com/weekly'
    # One URL per (week, year) pair, e.g. .../weekly/2015W01
    urls = [f'{base}/{year}W{week}' for week in weeks for year in years]

    # Leave one core free, but never ask for fewer than one worker:
    # Pool(0) raises ValueError on a single-core machine.
    workers = max(1, cpu_count() - 1)
    with Pool(workers) as p:
        results = p.map(get_table, urls)

    final = pd.concat(results)
    print(final)
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.