I'm having trouble formatting my scraped data. Any advice on how I could extract it into four columns (Winning Team, Losing Team, Winning Score, Losing Score)?
import scrapy
class sportsDataSpider(scrapy.Spider):
    """Spider that scrapes scores and team names from an ESPN NHL scoreboard page.

    A single item is yielded per response. Its values are page-wide lists:
    all losing scores, all winning scores, and every team name in page
    order (winners and losers mixed together).
    """

    name = "sportsSite"
    allowed_domains = ["www.espn.com"]
    start_urls = ["https://www.espn.com/nhl/scoreboard/_/date/20220504"]
    # Non-2xx status codes this spider wants delivered to its callback.
    handle_httpstatus_list = [404]

    def parse(self, response, **kwargs):
        """Extract the score and team-name lists from a scoreboard response.

        Args:
            response: The downloaded scoreboard page; must support
                ``response.css(query).extract()``.
            **kwargs: Extra callback arguments (unused).

        Yields:
            dict: One item with the keys ``'losing score'``,
            ``'winning score'`` and ``'teams'``, each mapping to a list of
            the text nodes matched by the corresponding CSS selector.
        """
        loser_sel = ".ScoreboardScoreCell__Item--loser .ScoreCell__Score::text"
        winner_sel = ".ScoreboardScoreCell__Item--winner .ScoreCell__Score::text"
        # Matches every team name on the page, so winners and losers end up
        # interleaved in a single list.
        team_sel = ".ScoreboardPage .ScoreCell__TeamName--shortDisplayName::text"

        loser_score = response.css(loser_sel).extract()
        winner_score = response.css(winner_sel).extract()
        teams = response.css(team_sel).extract()

        yield {
            'losing score': loser_score,
            'winning score': winner_score,
            'teams': teams,
        }
This is the output I currently get from this code:
{'losing score': ['2', '3', '2', '0'], 'winning score': ['5', '5', '6', '6'], 'teams': ['Bruins', 'Hurricanes', 'Lightning', 'Maple Leafs', 'Blues', 'Wild', 'Kings', 'Oilers']}
Instead of collecting all the teams at once based on `.ScoreboardPage`, try collecting two sets based on `.ScoreboardScoreCell__Item--loser` and `.ScoreboardScoreCell__Item--winner`. So:
def parse(self, response, **kwargs):
    """Extract losing/winning scores and team names from a scoreboard response.

    Team names are selected separately under the loser and winner item
    classes, so they come back as two lists instead of one mixed list.

    Args:
        response: The downloaded scoreboard page; must support
            ``response.css(query).extract()``.
        **kwargs: Extra callback arguments (unused).

    Yields:
        dict: One item with the keys ``'losing score'``, ``'winning score'``,
        ``'losing team'`` and ``'winning team'``, each mapping to a list of
        the text nodes matched by the corresponding CSS selector.
    """
    loser_item = ".ScoreboardScoreCell__Item--loser"
    winner_item = ".ScoreboardScoreCell__Item--winner"
    score_text = ".ScoreCell__Score::text"
    team_text = ".ScoreCell__TeamName--shortDisplayName::text"

    # NOTE(review): the four lists are collected by independent page-wide
    # queries; pairing them by index presumably relies on every game on the
    # page having both a winner and a loser item -- confirm for games that
    # are not final.
    loser_score = response.css(f"{loser_item} {score_text}").extract()
    winner_score = response.css(f"{winner_item} {score_text}").extract()
    loser_teams = response.css(f"{loser_item} {team_text}").extract()
    winner_teams = response.css(f"{winner_item} {team_text}").extract()

    yield {
        'losing score': loser_score,
        'winning score': winner_score,
        'losing team': loser_teams,
        'winning team': winner_teams,
    }
Output:
{'losing score': ['2', '3', '2', '0'],
'winning score': ['5', '5', '6', '6'],
'losing team': ['Bruins', 'Maple Leafs', 'Blues', 'Kings'],
'winning team': ['Hurricanes', 'Lightning', 'Wild', 'Oilers']}
For example, the Bruins lost to the Hurricanes 2–5; entries at the same position in each list belong to the same game.
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.