[英]How to scrape a table from a page and create a multi-column dataframe with python?
这个网站 https://aviation-safety.net/wikibase/ 的数据库从 1902 年一直到 2022 年。我正在尝试抓取 2015 年和 2016 年每起事故的表格、叙述、可能原因和分类：https://aviation-safety.net/database/dblist.php?Year=2015 。使用以下代码，我只能抓取表格：
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep
def scraping(year):
    """Scrape every accident record for *year* from aviation-safety.net.

    Walks the paginated year index, follows each accident's detail link and
    appends the raw text of every detail-table row to one flat list, which
    is then written out as a single-column 'status' CSV.  (This is the
    original, unstructured version from the question.)
    """
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    first_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    first_resp = requests.get(first_url, headers=headers)
    listing = BeautifulSoup(first_resp.text, 'html.parser')
    pager = listing.find('div', {'class': 'pagenumbers'})
    # Pager links end in "...&page=<n>"; the largest n is the page count.
    last_page = max([int(anchor['href'].split('=')[-1]) for anchor in pager.find_all('a')])

    rows_text = []
    for page_no in range(1, last_page + 1):
        page_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page_no}'
        print(page_url)
        page_resp = requests.get(page_url, headers=headers)
        page_soup = BeautifulSoup(page_resp.text, 'html.parser')
        index_table = page_soup.find('table')
        for row_no, index_row in enumerate(index_table.find_all('tr')):
            if row_no == 0:
                # First <tr> is the column-header row — no accident link.
                continue
            detail_url = 'https://aviation-safety.net/' + index_row.find('a')['href']
            detail_resp = requests.get(detail_url, headers=headers)
            detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
            detail_table = detail_soup.find('table')
            for detail_row in detail_table.find_all('tr'):
                rows_text.append(detail_row.text)

    df = pd.DataFrame(rows_text)
    df.columns = ['status']
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
if __name__ == "__main__":
    START = 2015
    STOP = 2016
    # Inclusive range of years to scrape.
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    # Fan the per-year scrapes out over a thread pool (I/O-bound work).
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping, years)
但是数据没有组织。 数据框如下所示:
结果应该是这样的:
看起来 tl 的值是字符串，例如 'Status:Accident investigation report completed and information captured'。
将字符串列表转换为pd.DataFrame
会得到一个包含列表中所有值的列。
如果您想使用字符串的“名称”,例如Status
作为列标题,您需要将它与其余文本分开。
# maxsplit of 1 so we don't accidentally split up the values, e.g. time
title, text = title.split(":", maxsplit=1)
这看起来像
('Status', 'Accident investigation report completed and information captured')
现在我们创建一个字典
row_dict[title] = text
给我们
{'Status': 'Accident investigation report completed and information captured'}
我们将在最后一个循环中添加到同一个字典
# old
for i in table1.find_all('tr'):
title = i.text
tl.append(title)
# new
row_dict = {}
for i in table1.find_all('tr'):
title = i.text
title, text = title.split(":", maxsplit=1)
row_dict[title] = text
在我们从页面收集了所有数据之后,即完成了row_dict
循环,我们追加到tl
。
row_dict = {}
for i in table1.find_all('tr'):
title = i.text
title, text = title.split(":", maxsplit=1)
row_dict[title] = text
tl.append(row_dict)
现在都在一起了
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep
def scraping(year):
    """Scrape all accident records for *year* from aviation-safety.net.

    For each accident linked from the paginated year index, every
    ``label: value`` row of the detail table is collected into one dict
    (e.g. {'Status': 'Accident investigation report completed ...'}), so the
    resulting DataFrame gets one column per label.  The frame is written to
    ``{year}_aviation-safety_new.csv``.
    """
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    page_container = soup.find('div', {'class': 'pagenumbers'})
    # Pager links end in "...&page=<n>"; the largest n is the page count.
    pages = max(int(page['href'].split('=')[-1]) for page in page_container.find_all('a'))

    tl = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')
        table = soup.find('table')
        for index, row in enumerate(table.find_all('tr')):
            if index == 0:
                # First <tr> is the column-header row — no accident link.
                continue
            link_ = 'https://aviation-safety.net/' + row.find('a')['href']
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')
            row_dict = {}
            for i in table1.find_all('tr'):
                # BUGFIX: split(':', maxsplit=1) raises ValueError on any row
                # without a colon, aborting the whole year.  partition() never
                # raises; a row with no colon yields an empty separator and is
                # skipped instead of crashing.
                title, sep, text = i.text.partition(':')
                if sep:
                    row_dict[title.strip()] = text.strip()
            tl.append(row_dict)

    df = pd.DataFrame(tl)
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
if __name__ == "__main__":
    START = 2015
    STOP = 2016
    # Build the inclusive list of years to fetch.
    years = [y for y in range(START, STOP + 1)]
    print(f'Scraping {len(years)} years of data')
    # Each year is scraped on its own thread; the pool cap is just an upper bound.
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping, years)
read_html()方法提供了对此类数据集的便捷访问。
>>> url = "https://web.archive.org/web/20221027040903/https://aviation-safety.net/database/dblist.php?Year=2015"
>>>
>>> dfs = pd.read_html(url)
>>>
>>> df = dfs[1].drop(columns="operator").dropna(axis=1, how="all")
>>> df["date"] = pd.to_datetime(df.date.str.replace("??-", "01-", regex=False), format="%d-%b-%Y")
>>> df.set_index("date")
type registration fat. location cat
date
2015-01-02 Saab 340B G-LGNL 0 Stornoway Ai... A1
2015-01-03 Antonov An-26B-100 RA-26082 0 Magadan-Soko... A1
2015-01-04 Fokker 50 5Y-SIB 0 Nairobi-Jomo... A1
2015-01-08 Bombardier Challenger 300 PR-YOU 0 São Paulo-Co... O1
2015-01-09 Cessna 208B Grand Caravan 8R-GAB 0 Matthews Rid... A2
... ... ... ... ... ..
2015-06-11 Eclipse 500 N508JA 0 Sacramento-E... A2
2015-06-11 Hawker 800XP N497AG 0 Port Harcour... A1
2015-06-12 Boeing 737-33A VH-NLK 0 near Kosrae Airpo... I2
2015-06-15 Antonov An-2R RA-84553 0 Tatsinsky di... A1
2015-06-16 Boeing 737-322 (WL) LY-FLB 0 Aktau Airpor... O1
[100 rows x 5 columns]
用户代理标头很难控制,因此要么使用协作站点,要么对请求做一些额外的工作,或者预先使用 curl 获取 html 文本。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.