[英]python and beautifulsoup scraping football
我正在研究一个抓取足球比赛结果的 Python 程序,在 Stack Overflow 上找到了这个符合我需求的问题:《Python Beautifulsoup4 网站解析》。
我想修改它,使其只返回某个特定日期或一组日期的比分,但一直没弄清楚该怎么做。目前下面的代码会返回每个日期的所有结果。 Thx, 马尔
from bs4 import BeautifulSoup
import urllib2
import csv
url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
league = 'FA'


def _text(tag):
    """Return the concatenated, whitespace-stripped text of *tag*, or None if *tag* is None."""
    if tag is None:
        return None
    return ''.join(tag.stripped_strings)


def _win_flags(score):
    """Return ``(home_win, away_win)`` as 0/1 flags for a ``'home-away'`` score string.

    Goals are compared as integers; comparing the raw strings would rank
    '10' below '9'. A score that is not exactly two integers separated by
    '-' yields ``(0, 0)``.
    """
    home_goals, _, away_goals = score.partition('-')
    try:
        home_goals, away_goals = int(home_goals), int(away_goals)
    except ValueError:
        return 0, 0
    return int(home_goals > away_goals), int(away_goals > home_goals)


# One results table per caption; the caption text is printed as a heading.
for games in soup.find_all('table', class_='table-stats'):
    print(_text(games.find('caption')))
    # Search inside this table only: searching `soup` here would repeat every
    # match on the page under every caption.
    for match in games.find_all('td', class_='match-details'):
        home = _text(match.find('span', class_='team-home'))
        score = _text(match.find('span', class_='score'))
        away = _text(match.find('span', class_='team-away'))
        # Skip incomplete cells before touching the score, so a missing
        # span cannot raise AttributeError.
        if not (home and score and away):
            continue
        home_win, away_win = _win_flags(score)
        # A single-argument print() emits the same "a , b , c" line under both
        # the Python 2 print statement and the Python 3 print function.
        print(' , '.join([league, home, str(home_win), score]))
        print(' , '.join([league, away, str(away_win), score]))
您需要从 <caption> 标签中提取日期,并添加一些判断逻辑,例如检查该日期是否在所需日期的列表中。
使用正则表达式从 <caption> 标签的文本中提取日期,并将其转换为 datetime 对象:
# Caption text looks like "9th January 2017": capture the day number and the
# "Month Year" part, skipping the two-letter ordinal suffix in between.
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})')
caption_text = played_tag.get_text()
# First date found in the caption, as a (day, "Month Year") tuple.
day, month_year = date_pattern.findall(caption_text)[0]
# Re-join the two captured parts and parse them into a datetime object.
played_date = datetime.strptime(' '.join((day, month_year)), '%d %B %Y')
并定义要抓取的日期列表:
list_of_dates = ['2017-01-07', '2017-01-06']  # put all dates to be printed here


def dates_to_datetime(dates):
    """Convert an iterable of 'YYYY-MM-DD' strings to a list of datetime objects.

    Raises ValueError (from ``datetime.strptime``) if a string does not
    match the '%Y-%m-%d' format.
    """
    return [datetime.strptime(d, '%Y-%m-%d') for d in dates]


# Rebind the name to the parsed values so later membership tests compare
# datetime objects with datetime objects.
list_of_dates = dates_to_datetime(list_of_dates)
最后,您需要添加一个 if 语句,检查从 <caption> 标签文本中提取到的日期是否在您的日期列表中:
# Only handle this table when the date parsed from its caption is one of the
# requested dates (list membership, i.e. an equality test against each entry).
if played_date in list_of_dates:
这是完整的示例:
import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv
list_of_dates = ['2017-01-07', '2017-01-06']  # put all dates to be printed here


def dates_to_datetime(dates):
    """Convert an iterable of 'YYYY-MM-DD' strings to a list of datetime objects.

    Raises ValueError (from ``datetime.strptime``) if a string does not
    match the '%Y-%m-%d' format.
    """
    return [datetime.strptime(d, '%Y-%m-%d') for d in dates]


# Rebind the name to the parsed values so later membership tests compare
# datetime objects with datetime objects.
list_of_dates = dates_to_datetime(list_of_dates)
# Date pattern in the <caption> tag, e.g. "9th January 2017": group 1 is the
# day number, group 2 is "Month Year"; the two-letter ordinal suffix is skipped.
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})')
url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
league = 'FA'


def _text(tag):
    """Return the concatenated, whitespace-stripped text of *tag*, or None if *tag* is None."""
    if tag is None:
        return None
    return ''.join(tag.stripped_strings)


def _win_flags(score):
    """Return ``(home_win, away_win)`` as 0/1 flags for a ``'home-away'`` score string.

    Goals are compared as integers; comparing the raw strings would rank
    '10' below '9'. A score that is not exactly two integers separated by
    '-' yields ``(0, 0)``.
    """
    home_goals, _, away_goals = score.partition('-')
    try:
        home_goals, away_goals = int(home_goals), int(away_goals)
    except ValueError:
        return 0, 0
    return int(home_goals > away_goals), int(away_goals > home_goals)


for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')
    if played_tag is None:
        continue  # table without a caption: no date to filter on
    # Retrieve the date from the caption text.
    found = date_pattern.search(played_tag.get_text())
    if found is None:
        continue  # caption holds no recognisable date
    played_date = datetime.strptime(' '.join(found.groups()), '%d %B %Y')
    # Only report tables whose date was requested.
    if played_date not in list_of_dates:
        continue
    print(_text(played_tag))
    # Search inside this table only: searching `soup` here would print every
    # match on the page under each selected date.
    for match in games.find_all('td', class_='match-details'):
        home = _text(match.find('span', class_='team-home'))
        score = _text(match.find('span', class_='score'))
        away = _text(match.find('span', class_='team-away'))
        # Skip incomplete cells before touching the score, so a missing
        # span cannot raise AttributeError.
        if not (home and score and away):
            continue
        home_win, away_win = _win_flags(score)
        # A single-argument print() emits the same "a , b , c" line under both
        # the Python 2 print statement and the Python 3 print function.
        print(' , '.join([league, home, str(home_win), score]))
        print(' , '.join([league, away, str(away_win), score]))
感谢 @Benjamin 让我走上正轨。为了在第二个循环中检查日期,我对他的答案稍作了修改。我知道这样效率不高,因为对每个选定的日期它都会遍历全部数据,但它确实实现了我的最终目标。
import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv
list_of_dates = ['2016-11-06', '2016-11-05']  # put all dates to be printed here


def dates_to_datetime(dates):
    """Convert an iterable of 'YYYY-MM-DD' strings to a list of datetime objects.

    Raises ValueError (from ``datetime.strptime``) if a string does not
    match the '%Y-%m-%d' format.
    """
    return [datetime.strptime(d, '%Y-%m-%d') for d in dates]


# Rebind the name to the parsed values so later membership tests compare
# datetime objects with datetime objects.
list_of_dates = dates_to_datetime(list_of_dates)
# Date pattern in the <caption> tag, e.g. "9th January 2017": group 1 is the
# day number, group 2 is "Month Year"; the two-letter ordinal suffix is skipped.
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})')
url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
league = 'FA'


def _text(tag):
    """Return the concatenated, whitespace-stripped text of *tag*, or None if *tag* is None."""
    if tag is None:
        return None
    return ''.join(tag.stripped_strings)


def _win_flags(score):
    """Return ``(home_win, away_win)`` as 0/1 flags for a ``'home-away'`` score string.

    Goals are compared as integers; comparing the raw strings would rank
    '10' below '9'. A score that is not exactly two integers separated by
    '-' yields ``(0, 0)``.
    """
    home_goals, _, away_goals = score.partition('-')
    try:
        home_goals, away_goals = int(home_goals), int(away_goals)
    except ValueError:
        return 0, 0
    return int(home_goals > away_goals), int(away_goals > home_goals)


for games in soup.find_all('table', class_='table-stats'):
    played_tag = games.find('caption')
    if played_tag is None:
        continue  # table without a caption: no date to filter on
    # Retrieve the date from the caption text.
    found = date_pattern.search(played_tag.get_text())
    if found is None:
        continue  # caption holds no recognisable date
    played_date = datetime.strptime(' '.join(found.groups()), '%d %B %Y')
    # Only report tables whose date was requested.
    if played_date not in list_of_dates:
        continue
    # This deliberately scans every match cell on the whole page once per
    # selected date and keeps the cells that belong to the current caption.
    for match in soup.find_all('td', class_='match-details'):
        # Caption of the element three levels above the cell
        # (presumably td -> tr -> tbody -> table; confirm against the page markup).
        match_date = match.parent.parent.parent.caption
        if match_date != played_tag:
            continue
        home = _text(match.find('span', class_='team-home'))
        score = _text(match.find('span', class_='score'))
        away = _text(match.find('span', class_='team-away'))
        # Skip incomplete cells before touching the score, so a missing
        # span cannot raise AttributeError.
        if not (home and score and away):
            continue
        home_win, away_win = _win_flags(score)
        # A single-argument print() emits the same "a , b , c" line under both
        # the Python 2 print statement and the Python 3 print function.
        print(' , '.join([str(played_date), league, home, str(home_win)]))
        print(' , '.join([str(played_date), league, away, str(away_win)]))
这样更好,因为它只遍历所需日期的比赛,而不是每次都循环全部数据。
import re
from datetime import datetime
from bs4 import BeautifulSoup
import urllib2
import csv
list_of_dates = ['2017-01-09', '2017-01-08', '2017-01-07', '2017-01-06']  # put all dates to be written here


def dates_to_datetime(dates):
    """Convert an iterable of 'YYYY-MM-DD' strings to a list of datetime objects.

    Raises ValueError (from ``datetime.strptime``) if a string does not
    match the '%Y-%m-%d' format.
    """
    return [datetime.strptime(d, '%Y-%m-%d') for d in dates]


def _text(tag):
    """Return the concatenated, whitespace-stripped text of *tag*, or None if *tag* is None."""
    if tag is None:
        return None
    return ''.join(tag.stripped_strings)


def _win_flags(score):
    """Return ``(home_win, away_win)`` as 0/1 flags for a ``'home-away'`` score string.

    Goals are compared as integers; comparing the raw strings would rank
    '10' below '9'. A score that is not exactly two integers separated by
    '-' yields ``(0, 0)``.
    """
    home_goals, _, away_goals = score.partition('-')
    try:
        home_goals, away_goals = int(home_goals), int(away_goals)
    except ValueError:
        return 0, 0
    return int(home_goals > away_goals), int(away_goals > home_goals)


# Rebind the name to the parsed values so the membership test below compares
# datetime objects with datetime objects.
list_of_dates = dates_to_datetime(list_of_dates)
# Date pattern in the <caption> tag, e.g. "9th January 2017": group 1 is the
# day number, group 2 is "Month Year"; the two-letter ordinal suffix is skipped.
date_pattern = re.compile(r'(\d{1,2})\w{2}\s([\w]+\s\d{4})')
url = 'http://www.bbc.co.uk/sport/football/fa-cup/results'
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
league = 'FA'

# Open the output only after the page was fetched and parsed, and let `with`
# close it even if an exception is raised while rows are being written.
with open('/FACup.csv', 'wt') as f:
    writer = csv.writer(f)
    for games in soup.find_all('table', class_='table-stats'):
        played_tag = games.find('caption')
        if played_tag is None:
            continue  # table without a caption: no date to filter on
        # Retrieve the date from the caption text.
        found = date_pattern.search(played_tag.get_text())
        if found is None:
            continue  # caption holds no recognisable date
        played_date = datetime.strptime(' '.join(found.groups()), '%d %B %Y')
        # Only export tables whose date was requested.
        if played_date not in list_of_dates:
            continue
        day = played_date.strftime('%Y-%m-%d')
        # Only the matches inside this table are visited.
        for match in games.find_all('td', class_='match-details'):
            home = _text(match.find('span', class_='team-home'))
            score = _text(match.find('span', class_='score'))
            away = _text(match.find('span', class_='team-away'))
            # Skip incomplete cells before touching the score, so a missing
            # span cannot raise AttributeError.
            if not (home and score and away):
                continue
            home_win, away_win = _win_flags(score)
            # One row per team: league, date, team, win flag, home/away marker.
            writer.writerow((league, day, home, home_win, 'H'))
            writer.writerow((league, day, away, away_win, 'A'))
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.