简体   繁体   中英

I'm trying to scrape data from a website using Scrapy. What's wrong with my code?

I'm trying to scrape data from https://www.premierleague.com/players. The page contains a list of players. I used the XPath expression response.xpath('//td/a/@href').getall() to get a list of relative URLs, one for each player. I then iterated over that list and joined each relative URL with the homepage URL to build a variable called "absolute_url"; for one of the players, "https://www.premierleague.com" + "/players/63289/Brenden-Aaronson/overview" gives https://www.premierleague.com/players/63289/Brenden-Aaronson/overview. I tested the XPath expressions in the Scrapy shell and they produce the desired output there, at least for the player overview pages I tested. Where am I going wrong?

import scrapy
from urllib.parse import urljoin


class PlStatsSpider(scrapy.Spider):
    """Spider declaration for scraping Premier League player pages."""

    # Spider name; this is the identifier passed to `scrapy crawl`.
    name = 'pl_stats'
    # Domain allow-list for the crawl.
    allowed_domains = ['premierleague.com']
    # NOTE(review): the crawl starts at the site root, whereas the player list
    # described in the question lives at /players — confirm this is intended.
    start_urls = ['http://premierleague.com']

# NOTE(review): as pasted, this function starts at column 0, so it is a
# module-level function rather than a method of PlStatsSpider — confirm
# whether the indentation was lost when the code was posted.
def parse(self, response):
    """Yield a follow-up request for every href matched by ``//td/a/@href``.

    Each href is joined with the hard-coded site root and the resulting
    request is routed to ``self.parse_players``.
    """
    url = 'http://premierleague.com'
    for link in response.xpath('//td/a/@href').getall():
        # Resolve the relative href against the hard-coded base URL above.
        absolute_url = urljoin(url, link)
        yield response.follow(absolute_url, callback=self.parse_players)

def parse_players(self, response):
    """Yield one dict of player attributes extracted from ``response``.

    Several fields are read by fixed position from the list of matched text
    nodes, so the result depends on every page having the same node count
    and order.
    """
    yield {
        'Name': response.xpath('//h1/div[@class="name t-colour"]/text()').get(),
        # Indexing the selector list raises IndexError when fewer nodes match;
        # the same applies to the [5], [6] and [2] lookups below.
        'DOB': response.xpath('//div[@class="personalLists"]//div[@class="info"]/text()')[3].get().strip(),
        'Height': response.xpath('//div[@class="personalLists"]//div[@class="info"]/text()')[5].get(),
        # .get() returns None when nothing matches, in which case .strip()
        # raises AttributeError.
        'Club': response.xpath('//div[@class="info"]/a/text()').get().strip(),
        'Weight': response.xpath('//div[@class="personalLists"]//div[@class="info"]/text()')[6].get(),
        'Position': response.xpath('//section[@class="sideWidget playerIntro t2-topBorder"]//div[@class="info"]/text()')[2].get(),
        'Nationality': response.xpath('//span[@class="playerCountry"]/text()').get()}

Most of your XPath expressions are a little too ambiguous for the many different pages you are trying to scrape data from. All of the player pages have slight variations that make extracting data using positional indexing nearly impossible. Additionally, not every field is available for every player, such as the position and club. What you could do for those fields is iterate through their section elements, grabbing all of the 'label'/'info' pairs, and match whatever is available to your output.

For example:

import scrapy

class PlStatsSpider(scrapy.Spider):
    """Crawl the Premier League player index and yield one item per player.

    ``parse`` follows every player link on the index page; ``parse_players``
    turns each player page into a dict of whatever attributes are present.
    """

    name = 'pl_stats'
    allowed_domains = ['premierleague.com']
    start_urls = ['https://www.premierleague.com/players']

    def parse(self, response):
        """Yield a request for every player link in the index table."""
        for link in response.xpath('//td/a/@href').getall():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_players)

    def parse_players(self, response):
        """Yield a dict describing the player shown on ``response``.

        The side-widget section contributes a variable set of label/info
        pairs (e.g. Club, Position); labels are title-cased and used as keys.
        Name, DOB, Height, Weight and Nationality are always present as keys
        and are ``None`` when the page does not provide them.
        """
        section = response.xpath("//section[contains(@class,'sideWidget playerIntro')]")
        columns = {}
        for classval in ('label', 'info'):
            texts = section.xpath(f"./div[@class='{classval}']//text()").getall()
            stripped = (text.strip() for text in texts)
            # De-duplicate while keeping document order. A set() has no
            # defined order, so labels and values could be zipped together
            # mismatched (e.g. "Position": "Southampton").
            columns[classval] = list(dict.fromkeys(text for text in stripped if text))
        item = {
            label.title(): value
            for label, value in zip(columns['label'], columns['info'])
        }
        # Guard the strip: .get() returns None when the node is missing.
        dob = response.xpath('//ul[@class="pdcol2"]//div[@class="info"]/text()').get()
        item.update({
            'Name': response.xpath('//div[@class="name t-colour"]/text()').get(),
            'DOB': dob.strip() if dob is not None else None,
            'Height': response.xpath('//ul[@class="pdcol3"]/li/div[@class="info"]/text()').get(),
            'Weight': response.xpath('//ul[@class="pdcol3"]/li[@class="u-hide"]/div[@class="info"]/text()').get(),
            'Nationality': response.xpath('//span[@class="playerCountry"]/text()').get(),
        })
        yield item

This is the JSON file that was produced after calling scrapy crawl pl_stats -o players.json.

[
  {
    "Position": "Defender",
    "Name": "Max Aarons",
    "DOB": "04/01/2000",
    "Height": "178cm",
    "Weight": null,
    "Nationality": "England"
  },
  {
    "Position": "Forward",
    "Club": "Manchester City",
    "Name": "Juli\u00e1n \u00c1lvarez",
    "DOB": "31/01/2000",
    "Height": "170cm",
    "Weight": "71kg",
    "Nationality": "Argentina"
  },
  {
    "Position": "Defender",
    "Club": "Leicester City",
    "Name": "Daniel Amartey",
    "DOB": "21/12/1994",
    "Height": "186cm",
    "Weight": "79kg",
    "Nationality": "Ghana"
  },
  {
    "Position": "Forward",
    "Name": "Will Alves",
    "DOB": "04/05/2005",
    "Height": null,
    "Weight": null,
    "Nationality": "England"
  },
  {
    "Position": "Midfielder",
    "Club": "Brighton and Hove Albion",
    "Name": "Steven Alzate",
    "DOB": "08/09/1998",
    "Height": "180cm",
    "Weight": "75kg",
    "Nationality": "Colombia"
  },
  {
    "Position": "Defender",
    "Name": "Marcos Alonso",
    "DOB": "28/12/1990",
    "Height": "188cm",
    "Weight": null,
    "Nationality": "Spain"
  },
  {
    "Position": "Midfielder",
    "Name": "Jaime Alvarado",
    "DOB": "26/07/1999",
    "Height": "179cm",
    "Weight": null,
    "Nationality": "Colombia"
  },
  {
    "Position": "Midfielder",
    "Club": "Newcastle United",
    "Name": "Miguel Almir\u00f3n",
    "DOB": "10/02/1994",
    "Height": "174cm",
    "Weight": "70kg",
    "Nationality": "Paraguay"
  },
  {
    "Position": "Goalkeeper",
    "Name": "\u00c1lvaro Fern\u00e1ndez",
    "DOB": "13/04/1998",
    "Height": "185cm",
    "Weight": null,
    "Nationality": "Spain"
  },
  {
    "Position": "Midfielder",
    "Club": "Everton",
    "Name": "Allan",
    "DOB": "08/01/1991",
    "Height": "173cm",
    "Weight": "73kg",
    "Nationality": "Brazil"
  },
  {
    "Position": "Goalkeeper",
    "Club": "Liverpool",
    "Name": "Alisson",
    "DOB": "02/10/1992",
    "Height": "191cm",
    "Weight": "91kg",
    "Nationality": "Brazil"
  },
  {
    "Position": "Defender",
    "Name": "Ezgjan Alioski",
    "DOB": "12/02/1992",
    "Height": "173cm",
    "Weight": null,
    "Nationality": "North Macedonia"
  },
  {
    "Position": "Midfielder",
    "Name": "Dele Alli",
    "DOB": "11/04/1996",
    "Height": "188cm",
    "Weight": null,
    "Nationality": "England"
  },
  {
    "Position": "Defender",
    "Name": "Alex Telles",
    "DOB": "15/12/1992",
    "Height": "181cm",
    "Weight": null,
    "Nationality": "Brazil"
  },
  {
    "Position": "Defender",
    "Club": "Liverpool",
    "Name": "Trent Alexander-Arnold",
    "DOB": "07/10/1998",
    "Height": "175cm",
    "Weight": "69kg",
    "Nationality": "England"
  },
  {
    "Position": "Defender",
    "Name": "Ajibola Alese",
    "DOB": "17/01/2001",
    "Height": null,
    "Weight": null,
    "Nationality": "England"
  },
  {
    "Position": "Defender",
    "Name": "Toby Alderweireld",
    "DOB": "02/03/1989",
    "Height": "186cm",
    "Weight": null,
    "Nationality": "Belgium"
  },
  {
    "Position": "Defender",
    "Club": "Manchester City",
    "Name": "Nathan Ak\u00e9",
    "DOB": "18/02/1995",
    "Height": "180cm",
    "Weight": "75kg",
    "Nationality": "Netherlands"
  },
  {
    "Position": "Defender",
    "Club": "Brentford",
    "Name": "Kristoffer Ajer",
    "DOB": "17/04/1998",
    "Height": "198cm",
    "Weight": "92kg",
    "Nationality": "Norway"
  },
  {
    "Position": "Midfielder",
    "Club": "Leicester City",
    "Name": "Marc Albrighton",
    "DOB": "18/11/1989",
    "Height": "175cm",
    "Weight": "74kg",
    "Nationality": "England"
  },
  {
    "Position": "Defender",
    "Club": "Wolverhampton Wanderers",
    "Name": "Rayan A\u00eft-Nouri",
    "DOB": "06/06/2001",
    "Height": "179cm",
    "Weight": "70kg",
    "Nationality": "France"
  },
  {
    "Position": "Defender",
    "Name": "Ryan Alebiosu",
    "DOB": "17/12/2001",
    "Height": null,
    "Weight": null,
    "Nationality": "England"
  },
  {
    "Position": "Defender",
    "Name": "Ahmed El Mohamady",
    "DOB": "09/09/1987",
    "Height": "183cm",
    "Weight": null,
    "Nationality": "Egypt"
  },
  {
    "Position": "Defender",
    "Name": "Derek Agyakwa",
    "DOB": "19/12/2001",
    "Height": null,
    "Weight": null,
    "Nationality": "Netherlands"
  },
  {
    "Position": "Forward",
    "Name": "Sergio Ag\u00fcero",
    "DOB": "02/06/1988",
    "Height": "173cm",
    "Weight": null,
    "Nationality": "Argentina"
  },
  {
    "Position": "Defender",
    "Name": "Tayo Adaramola",
    "DOB": "14/11/2003",
    "Height": null,
    "Weight": null,
    "Nationality": "Ireland"
  },
  {
    "Position": "Goalkeeper",
    "Club": "Liverpool",
    "Name": "Adri\u00e1n",
    "DOB": "03/01/1987",
    "Height": "190cm",
    "Weight": "80kg",
    "Nationality": "Spain"
  },
  {
    "Position": "Forward",
    "Club": "Southampton",
    "Name": "Che Adams",
    "DOB": "13/07/1996",
    "Height": "175cm",
    "Weight": "70kg",
    "Nationality": "Scotland"
  },
  {
    "Position": "Forward",
    "Club": "Southampton",
    "Name": "Adam Armstrong",
    "DOB": "10/02/1997",
    "Height": "174cm",
    "Weight": "69kg",
    "Nationality": "England"
  },
  {
    "Position": "Forward",
    "Name": "Tammy Abraham",
    "DOB": "02/10/1997",
    "Height": "190cm",
    "Weight": null,
    "Nationality": "England"
  }
]

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM