Web scraping and pagination with Python, CSV, BeautifulSoup and pandas

The database at https://aviation-safety.net/wikibase/ covers 1902 through 2022. The code below misses some of those years: it does not capture years before 1912 or after 2021. I want to scrape all accidents, for all aircraft types, by year. The listing starts at https://aviation-safety.net/wikibase/dblist.php?Year=1902 and should end at https://aviation-safety.net/wikibase/dblist.php?Year=2022 . Currently the code dumps the results into .csv files, but SQLite would also work.

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):
    # use a browser-like header, since the site sometimes blocks requests that are missing "accept" and "user-agent"
    headers =   {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text,'html.parser')

    page_container = soup.find('div',{'class':'pagenumbers'})
    # find the last page number: take every pagination link at the bottom of the page,
    # split its href on '=', convert the trailing value to int, and keep the maximum
    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])

    info = []
    for page in range(1,pages+1):

        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)

        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')


        table = soup.find('table',{'class':'hp'})


        regex = re.compile('list.*')
        for index,row in enumerate(table.find_all('tr',{'class':regex})):
            if index == 0:
                continue

            acc_link = 'https://aviation-safety.net/'+row.find('a')['href']
            # some rows only give a partial date (month-year or year only); prepend default
            # values so strptime can still parse them, otherwise skip the row
            try:
                acc_date = datetime.strptime(row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
            except ValueError:
                try:
                    acc_date = datetime.strptime("01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01-01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue

            acc_type = row.find_all('td')[1].text
            acc_reg = row.find_all('td')[2].text
            acc_operator = row.find_all('td')[3].text
            acc_fat = row.find_all('td')[4].text
            acc_location = row.find_all('td')[5].text
            acc_dmg = row.find_all('td')[7].text

            item = {
                'acc_link' : acc_link,
                'acc_date': acc_date,
                'acc_type': acc_type,
                'acc_reg': acc_reg,
                'acc_operator' :acc_operator,
                'acc_fat':acc_fat,
                'acc_location':acc_location,
                'acc_dmg':acc_dmg
                }

            info.append(item)

    df= pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', encoding='utf-8-sig', index=False)


if __name__ == "__main__":

    START = 1901
    STOP = 2023

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year,years)
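The results could also go into SQLite instead of per-year CSV files, as mentioned above. A minimal sketch of that variant, assuming a database file name aviation_safety.db and a table name accidents (both are placeholders, not anything the site or the code above requires), using pandas' to_sql with the standard-library sqlite3 module:

import sqlite3
import pandas as pd

def save_to_sqlite(df, db_path='aviation_safety.db'):
    # append each year's rows to one "accidents" table; it is created on first use
    conn = sqlite3.connect(db_path)
    try:
        df.to_sql('accidents', conn, if_exists='append', index=False)
    finally:
        conn.close()

# inside scrape_year, replace df.to_csv(...) with:
# save_to_sqlite(df)

Note that with 60 worker threads each writing to the same SQLite file, concurrent inserts can hit "database is locked" errors, so it may be simpler to keep the per-year CSVs during scraping and load them into SQLite in a single pass afterwards.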

Lmao, I actually wrote this code for someone on this site a while back. I've edited it here to handle the years that were being missed:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):

    try:
        headers =   {
            'accept':'*/*',
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
            }

        url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
        req = requests.get(url, headers=headers)

        soup = BeautifulSoup(req.text,'html.parser')

        page_container = soup.find('div',{'class':'pagenumbers'})

        try:
            pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
        except Exception:
            # years with only one page of results have no pagination links,
            # so page_container is None here and we fall back to a single page
            pages = 1

        info = []
        for page in range(1,pages+1):

            new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
            print(new_url)

            data = requests.get(new_url,headers=headers)
            soup = BeautifulSoup(data.text,'html.parser')


            table = soup.find('table',{'class':'hp'})


            regex = re.compile('list.*')
            for index,row in enumerate(table.find_all('tr',{'class':regex})):
                if index == 0:
                    continue

                acc_link = 'https://aviation-safety.net/'+row.find('a')['href']
                try:
                    acc_date = datetime.strptime(row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    try:
                        acc_date = datetime.strptime("01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        try:
                            acc_date = datetime.strptime("01-01"+row.find('a').text.strip(),'%d-%b-%Y').strftime('%Y-%m-%d')
                        except ValueError:
                            continue

                acc_type = row.find_all('td')[1].text
                acc_reg = row.find_all('td')[2].text
                acc_operator = row.find_all('td')[3].text
                acc_fat = row.find_all('td')[4].text
                acc_location = row.find_all('td')[5].text
                acc_dmg = row.find_all('td')[7].text

                item = {
                    'acc_link' : acc_link,
                    'acc_date': acc_date,
                    'acc_type': acc_type,
                    'acc_reg': acc_reg,
                    'acc_operator' :acc_operator,
                    'acc_fat':acc_fat,
                    'acc_location':acc_location,
                    'acc_dmg':acc_dmg
                    }

                info.append(item)

        df= pd.DataFrame(info)
        df.to_csv(f'{year}_aviation-safety.csv',index=False)

    except Exception as e:
        print(e, url)



if __name__ == "__main__":

    START = 1902
    STOP = 2022

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year,years)
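Since each year still ends up in its own CSV, you may want a single combined file afterwards. A small sketch (the glob pattern simply matches the filenames written by scrape_year above) that stacks them with pandas:

import glob
import pandas as pd

# collect every per-year CSV produced by scrape_year and concatenate into one frame
files = sorted(glob.glob('*_aviation-safety.csv'))
combined = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
combined.to_csv('aviation-safety_all_years.csv', encoding='utf-8-sig', index=False)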

