[英]How to scrape a table from a page and create a multi-column dataframe with python?
这个网站 https://aviation-safety.net/wikibase/ 的数据库从 1902 年一直到 2022 年。我正在尝试抓取 2015 年和 2016 年每起事故的表格、叙述、可能原因和分类：https://aviation-safety.net/database/dblist.php?Year=2015 。使用以下代码，我只能抓取表格：
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep
def scraping(year):
    """Scrape every accident record for *year* from aviation-safety.net.

    Walks the paginated year index, follows each accident's detail link and
    appends the raw text of every detail-table row to one flat list, which
    is then written out as a single-column 'status' CSV.  (This is the
    original, unstructured version from the question.)
    """
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    first_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    first_resp = requests.get(first_url, headers=headers)
    listing = BeautifulSoup(first_resp.text, 'html.parser')
    pager = listing.find('div', {'class': 'pagenumbers'})
    # Pager links end in "...&page=<n>"; the largest n is the page count.
    last_page = max([int(anchor['href'].split('=')[-1]) for anchor in pager.find_all('a')])

    rows_text = []
    for page_no in range(1, last_page + 1):
        page_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page_no}'
        print(page_url)
        page_resp = requests.get(page_url, headers=headers)
        page_soup = BeautifulSoup(page_resp.text, 'html.parser')
        index_table = page_soup.find('table')
        for row_no, index_row in enumerate(index_table.find_all('tr')):
            if row_no == 0:
                # First <tr> is the column-header row — no accident link.
                continue
            detail_url = 'https://aviation-safety.net/' + index_row.find('a')['href']
            detail_resp = requests.get(detail_url, headers=headers)
            detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
            detail_table = detail_soup.find('table')
            for detail_row in detail_table.find_all('tr'):
                rows_text.append(detail_row.text)

    df = pd.DataFrame(rows_text)
    df.columns = ['status']
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
if __name__ == "__main__":
    START = 2015
    STOP = 2016
    # Inclusive range of years to scrape.
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    # Fan the per-year scrapes out over a thread pool (I/O-bound work).
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping, years)
但是数据没有组织。 数据框如下所示:
结果应该是这样的:
看起来 tl 的值是字符串，例如 'Status:Accident investigation report completed and information captured'。
将字符串列表转换为pd.DataFrame
会得到一个包含列表中所有值的列。
如果您想使用字符串的“名称”,例如Status
作为列标题,您需要将它与其余文本分开。
# maxsplit of 1 so we don't accidentally split up the values, e.g. time
title, text = title.split(":", maxsplit=1)
这看起来像
('Status', 'Accident investigation report completed and information captured')
现在我们创建一个字典
row_dict[title] = text
给我们
{'Status': 'Accident investigation report completed and information captured'}
我们将在最后一个循环中添加到同一个字典
# old
for i in table1.find_all('tr'):
title = i.text
tl.append(title)
# new
row_dict = {}
for i in table1.find_all('tr'):
title = i.text
title, text = title.split(":", maxsplit=1)
row_dict[title] = text
在我们从页面收集了所有数据之后,即完成了row_dict
循环,我们追加到tl
。
row_dict = {}
for i in table1.find_all('tr'):
title = i.text
title, text = title.split(":", maxsplit=1)
row_dict[title] = text
tl.append(row_dict)
现在都在一起了
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep
def scraping(year):
    """Scrape all accident records for *year* from aviation-safety.net.

    For each accident linked from the paginated year index, every
    ``label: value`` row of the detail table is collected into one dict
    (e.g. {'Status': 'Accident investigation report completed ...'}), so the
    resulting DataFrame gets one column per label.  The frame is written to
    ``{year}_aviation-safety_new.csv``.
    """
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'html.parser')
    page_container = soup.find('div', {'class': 'pagenumbers'})
    # Pager links end in "...&page=<n>"; the largest n is the page count.
    pages = max(int(page['href'].split('=')[-1]) for page in page_container.find_all('a'))

    tl = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')
        table = soup.find('table')
        for index, row in enumerate(table.find_all('tr')):
            if index == 0:
                # First <tr> is the column-header row — no accident link.
                continue
            link_ = 'https://aviation-safety.net/' + row.find('a')['href']
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')
            row_dict = {}
            for i in table1.find_all('tr'):
                # BUGFIX: split(':', maxsplit=1) raises ValueError on any row
                # without a colon, aborting the whole year.  partition() never
                # raises; a row with no colon yields an empty separator and is
                # skipped instead of crashing.
                title, sep, text = i.text.partition(':')
                if sep:
                    row_dict[title.strip()] = text.strip()
            tl.append(row_dict)

    df = pd.DataFrame(tl)
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
if __name__ == "__main__":
    START = 2015
    STOP = 2016
    # Build the inclusive list of years to fetch.
    years = [y for y in range(START, STOP + 1)]
    print(f'Scraping {len(years)} years of data')
    # Each year is scraped on its own thread; the pool cap is just an upper bound.
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping, years)
read_html()方法提供了对此类数据集的便捷访问。
>>> url = "https://web.archive.org/web/20221027040903/https://aviation-safety.net/database/dblist.php?Year=2015"
>>>
>>> dfs = pd.read_html(url)
>>>
>>> df = dfs[1].drop(columns="operator").dropna(axis=1, how="all")
>>> df["date"] = pd.to_datetime(df.date.str.replace("??-", "01-", regex=False), format="%d-%b-%Y")
>>> df.set_index("date")
type registration fat. location cat
date
2015-01-02 Saab 340B G-LGNL 0 Stornoway Ai... A1
2015-01-03 Antonov An-26B-100 RA-26082 0 Magadan-Soko... A1
2015-01-04 Fokker 50 5Y-SIB 0 Nairobi-Jomo... A1
2015-01-08 Bombardier Challenger 300 PR-YOU 0 São Paulo-Co... O1
2015-01-09 Cessna 208B Grand Caravan 8R-GAB 0 Matthews Rid... A2
... ... ... ... ... ..
2015-06-11 Eclipse 500 N508JA 0 Sacramento-E... A2
2015-06-11 Hawker 800XP N497AG 0 Port Harcour... A1
2015-06-12 Boeing 737-33A VH-NLK 0 near Kosrae Airpo... I2
2015-06-15 Antonov An-2R RA-84553 0 Tatsinsky di... A1
2015-06-16 Boeing 737-322 (WL) LY-FLB 0 Aktau Airpor... O1
[100 rows x 5 columns]
用户代理标头很难控制,因此要么使用协作站点,要么对请求做一些额外的工作,或者预先使用 curl 获取 html 文本。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.