簡體   English   中英

使用 Python 和 BeautifulSoup 抓取足球比賽結果

[英]python and beautifulsoup scraping football

我正在研究一個抓取足球比賽結果的 Python 程序,並在 Stack Overflow 上找到了這個符合我需求的問題:「Python Beautifulsoup4 網站解析」。

我想修改它,使其只返回特定日期或一組日期的比分,但一直想不出該怎麼做;目前的代碼會返回每個日期的所有結果。謝謝,馬爾

from bs4 import BeautifulSoup
import urllib2
import csv

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)

league =  'FA'

for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')
    played = played_tag and ''.join(played_tag.stripped_strings)
    print played

    for match in soup.find_all('td', class_='match-details'):
        home_tag = match.find('span', class_='team-home')
        home = home_tag and ''.join(home_tag.stripped_strings)
        score_tag = match.find('span', class_='score')
        score = score_tag and ''.join(score_tag.stripped_strings)
        away_tag = match.find('span', class_='team-away')
        away = away_tag and ''.join(away_tag.stripped_strings)

        if ( score.split('-')[0] > score.rsplit('-')[1] ):
            home_win = 1
        else:
            home_win = 0

        if (score.rsplit('-')[1] > score.split('-')[0] ):
            away_win = 1
        else:
            away_win = 0


        if home and score and away:
            print league,',',home,',',home_win,',',score
            print league,',',away,',',away_win,',',score

您需要從標題標簽中檢索日期並添加一些邏輯來檢查,例如,所需日期的列表。

使用正則表達式從標題標簽字符串中檢索日期並將其轉換為日期時間對象:

# Caption dates look like "9th January 2017": capture the day number, skip the
# two-letter ordinal suffix, then capture "<month name> <year>".
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})')

# First date found in the caption text, as a (day, 'Month Year') tuple.
day_and_month_year = date_pattern.findall(played_tag.get_text())[0]
# Rebuild e.g. "9 January 2017" and parse it into a datetime object.
played_date = datetime.strptime(' '.join(day_and_month_year), '%d %B %Y')

並定義要抓取的日期列表:

# Dates whose results should be printed, written as 'YYYY-MM-DD'.
list_of_dates = ['2017-01-07', '2017-01-06']


def dates_to_datetime(dates):
    """Parse each 'YYYY-MM-DD' string in *dates* and return the datetimes as a list."""
    return [datetime.strptime(day, '%Y-%m-%d') for day in dates]


# Replace the strings with datetime objects so they can be compared with parsed dates.
list_of_dates = dates_to_datetime(list_of_dates)

最后,您需要添加一個 if 語句來檢查從標題標簽字符串中檢索到的日期是否在您的日期列表中:

# check if retrieved date is in list_of_dates
# (membership uses ==, so list_of_dates must hold datetime objects, not strings)
if played_date in list_of_dates:

這是完整的示例:

import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv


# Match days to report, one 'YYYY-MM-DD' string per day.
list_of_dates = ['2017-01-07', '2017-01-06']


def dates_to_datetime(dates):
    """Return a list with every 'YYYY-MM-DD' string in *dates* parsed into a datetime."""
    iso_format = '%Y-%m-%d'
    return [datetime.strptime(text, iso_format) for text in dates]


# From here on list_of_dates holds datetime objects instead of strings.
list_of_dates = dates_to_datetime(list_of_dates)

date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # date pattern in <caption> tag i.e.: 9th January 2017

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)

league =  'FA'

for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')

    # retrieve date from caption tag
    played_date = re.findall(date_pattern, played_tag.get_text())
    played_date = ' '.join(played_date[0])
    played_date = datetime.strptime(played_date, '%d %B %Y')

    # check if retrieved date is in list_of_dates
    if played_date in list_of_dates:

        played = played_tag and ''.join(played_tag.stripped_strings)
        print played

        for match in soup.find_all('td', class_='match-details'):
            home_tag = match.find('span', class_='team-home')
            home = home_tag and ''.join(home_tag.stripped_strings)
            score_tag = match.find('span', class_='score')
            score = score_tag and ''.join(score_tag.stripped_strings)
            away_tag = match.find('span', class_='team-away')
            away = away_tag and ''.join(away_tag.stripped_strings)

            if ( score.split('-')[0] > score.rsplit('-')[1] ):
                home_win = 1
            else:
                home_win = 0

            if (score.rsplit('-')[1] > score.split('-')[0] ):
                away_win = 1
            else:
                away_win = 0


            if home and score and away:
                print league,',',home,',',home_win,',',score
                print league,',',away,',',away_win,',',score
    else:
        pass

感謝 @Benjamin 讓我走上正軌。我對他的答案稍作修改,在第二個循環中檢查每場比賽所屬的日期。我知道這樣做效率不高,因為對每個選定的日期都會遍歷全部數據,但它確實達到了我的最終目標。

import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv


# Requested match days in 'YYYY-MM-DD' form.
list_of_dates = ['2016-11-06', '2016-11-05']


def dates_to_datetime(dates):
    """Turn a list of 'YYYY-MM-DD' strings into a list of datetime objects, in the same order."""
    parse = datetime.strptime
    return [parse(stamp, '%Y-%m-%d') for stamp in dates]


# Rebind list_of_dates to the parsed datetime objects.
list_of_dates = dates_to_datetime(list_of_dates)

date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # date pattern in <caption> tag i.e.: 9th January 2017

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)

league =  'FA'

for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')

    # retrieve date from caption tag
    played_date = re.findall(date_pattern, played_tag.get_text())
    played_date = ' '.join(played_date[0])
    played_date = datetime.strptime(played_date, '%d %B %Y')

    # check if retrieved date is in list_of_dates
    if played_date in list_of_dates:

        for match in soup.find_all('td', class_='match-details'):

        # get parent match date            
            match_date = match.parent.parent.parent.caption
            if match_date == played_tag:
                home_tag = match.find('span', class_='team-home')
                home = home_tag and ''.join(home_tag.stripped_strings)
                score_tag = match.find('span', class_='score')
                score = score_tag and ''.join(score_tag.stripped_strings)
                away_tag = match.find('span', class_='team-away')
                away = away_tag and ''.join(away_tag.stripped_strings)

                if ( score.split('-')[0] > score.rsplit('-')[1] ):
                    home_win = 1
                else:
                    home_win = 0

                if (score.rsplit('-')[1] > score.split('-')[0] ):
                    away_win = 1
                else:
                    away_win = 0


                if home and score and away:
                    print played_date, ',', league,',',home,',',home_win
                    print played_date, ',', league,',',away,',',away_win
        else:
            pass

這個版本更好,因為它只會遍歷所需日期的比賽,而不是遍歷全部比賽。

import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv

# This script targets Python 2 (it uses urllib2), where the csv module expects
# its output file to be opened in binary mode; text mode ('wt') produces
# doubled line endings (\r\r\n) on platforms that translate newlines.
f = open('/FACup.csv', 'wb')
writer = csv.writer(f)

# Match days to export, each written as 'YYYY-MM-DD'.
list_of_dates = ['2017-01-09', '2017-01-08', '2017-01-07', '2017-01-06']


def dates_to_datetime(dates):
    """Return the datetime objects for a list of 'YYYY-MM-DD' date strings."""
    return [datetime.strptime(entry, '%Y-%m-%d') for entry in dates]


# Keep the parsed datetime objects under the same name for the date filter.
list_of_dates = dates_to_datetime(list_of_dates)

def _caption_date(caption):
    """Return the datetime found in a <caption> tag, or None.

    None is returned when the tag is missing, its text holds no date matching
    ``date_pattern``, or the matched text is not a valid '%d %B %Y' date.
    """
    if caption is None:
        return None
    found = date_pattern.search(caption.get_text())
    if found is None:
        return None
    try:
        return datetime.strptime(' '.join(found.groups()), '%d %B %Y')
    except ValueError:
        return None


def _win_flags(score):
    """Return ``(home_win, away_win)`` as 0/1 flags for a score such as '2-1'.

    Goals are compared as integers, so '10-9' counts as a home win (comparing
    the raw strings would put '10' before '9').  Returns None when the score
    is missing or is not exactly two integers separated by '-'.
    """
    if not score:
        return None
    try:
        home_goals, away_goals = [int(part) for part in score.split('-')]
    except ValueError:
        # Non-numeric text, or not exactly two parts.
        return None
    return int(home_goals > away_goals), int(away_goals > home_goals)


def _span_text(cell, css_class):
    """Return the joined text of the first <span> with *css_class* in *cell*, or None if absent."""
    span = cell.find('span', class_=css_class)
    if span is None:
        return None
    return ''.join(span.stripped_strings)


date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})') # date pattern in <caption> tag i.e.: 9th January 2017

url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'

league = 'FA'

# The output file is already open at this point; the try/finally makes sure it
# is closed (and flushed) even if the download or the parsing raises.
try:
    page = urllib2.urlopen(url).read()
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')

    for games in soup.find_all('table', class_='table-stats'):
        played_tag = games.find('caption')

        # Skip tables whose caption has no usable date or a date that was not requested.
        played_date = _caption_date(played_tag)
        if played_date not in list_of_dates:
            continue

        # The same date string is written on every row of this table.
        played_on = played_date.strftime('%Y-%m-%d')

        for match in games.find_all('td', class_='match-details'):
            home = _span_text(match, 'team-home')
            score = _span_text(match, 'score')
            away = _span_text(match, 'team-away')

            flags = _win_flags(score)
            if not (home and away) or flags is None:
                # Incomplete row or unparseable score: nothing reliable to write.
                continue
            home_win, away_win = flags

            # One row per team: league, date, team, win flag, home/away marker.
            writer.writerow( (league, played_on, home, home_win, 'H') )
            writer.writerow( (league, played_on, away, away_win, 'A') )
finally:
    f.close()

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM