![](/img/trans.png)
[英]Response from website doesn't contain elements I'm trying to find using Xpath and Scrapy. However using ChroPath they are there
[英]I'm trying to scrape data from a website using Scrapy. What's wrong with my code?
我正在嘗試從 https://www.premierleague.com/players 抓取數據。在網頁上,有一個球員列表。我使用 xpath 表達式 response.xpath('//td/a/@href').getall() 來獲取每個球員的相對 url 列表。然后我遍歷了相對 url 的列表,並將它們與主頁合並,得到一個名為“absolute_url”的變量。對於其中一名球員,它看起來像這樣:“https://www.premierleague.com” + “/players/63289/Brenden-Aaronson/overview”,即 https://www.premierleague.com/players/63289/Brenden-Aaronson/overview 。我在 scrapy shell 上測試了這些 xpath,它們在 scrapy shell 上生成了所需的 output……至少對於我測試的那名球員的概述頁面是這樣。我哪里錯了?
import scrapy
from urllib.parse import urljoin
class PlStatsSpider(scrapy.Spider):
    """Spider that follows every ``//td/a`` link on the start page and
    yields one dict of player details per followed page.
    """

    name = 'pl_stats'
    allowed_domains = ['premierleague.com']
    start_urls = ['http://premierleague.com']

    def parse(self, response):
        """Build an absolute URL for each table-cell link and follow it."""
        url = 'http://premierleague.com'
        hrefs = response.xpath('//td/a/@href').getall()
        for link in hrefs:
            absolute_url = urljoin(url, link)  # merging relative url
            yield response.follow(absolute_url, callback=self.parse_players)

    def parse_players(self, response):
        """Yield a dict of player fields picked out by fixed positions.

        Each indexed lookup assumes the page exposes at least that many
        matching text nodes; the fields are filled in the order listed.
        """
        personal_xpath = '//div[@class="personalLists"]//div[@class="info"]/text()'
        intro_xpath = '//section[@class="sideWidget playerIntro t2-topBorder"]//div[@class="info"]/text()'

        record = {}
        record['Name'] = response.xpath('//h1/div[@class="name t-colour"]/text()').get()
        record['DOB'] = response.xpath(personal_xpath)[3].get().strip()
        record['Height'] = response.xpath(personal_xpath)[5].get()
        record['Club'] = response.xpath('//div[@class="info"]/a/text()').get().strip()
        record['Weight'] = response.xpath(personal_xpath)[6].get()
        record['Position'] = response.xpath(intro_xpath)[2].get()
        record['Nationality'] = response.xpath('//span[@class="playerCountry"]/text()').get()
        yield record
對於您試圖抓取數據的許多不同頁面來說,您的大多數 xpath 有點過於模糊。所有的球員頁面都有細微的差異,這使得使用位置索引提取數據幾乎是不可能的。此外,並非每個字段都適用於每個球員,例如 position 和 club。對於這些字段,您可以遍歷它們所在的 section 元素,獲取所有“標簽(label)”、“信息(info)”對,並把可用的內容匹配到您的 output 中。
例如:
import scrapy
class PlStatsSpider(scrapy.Spider):
    """Crawl the player index page and yield one dict per player page.

    ``parse`` follows every ``//td/a`` link on the start page;
    ``parse_players`` turns each followed page into an item.
    """

    name = 'pl_stats'
    allowed_domains = ['premierleague.com']
    start_urls = ['https://www.premierleague.com/players']

    def parse(self, response):
        """Schedule a request to ``parse_players`` for every table-cell link."""
        for link in response.xpath('//td/a/@href').getall():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_players)

    @staticmethod
    def _unique_texts(section, classval):
        """Return the stripped, non-empty text nodes found under
        ``./div[@class=classval]``, de-duplicated in the order returned.
        """
        raw_texts = section.xpath(f"./div[@class='{classval}']//text()").getall()
        stripped = (text.strip() for text in raw_texts)
        # dict.fromkeys drops duplicates like set() did, but keeps first-seen
        # order. A set has no defined order, so labels and values collected
        # separately could be zipped together misaligned (e.g. the club name
        # ending up under "Position").
        return list(dict.fromkeys(text for text in stripped if text))

    def parse_players(self, response):
        """Yield a dict of the player's details.

        Label/info pairs from the player intro section are included only when
        present on the page; Name, DOB, Height, Weight and Nationality are
        always included and are ``None`` when their xpath matches nothing.
        """
        section = response.xpath("//section[contains(@class,'sideWidget playerIntro')]")
        labels = self._unique_texts(section, 'label')
        values = self._unique_texts(section, 'info')
        item = {label.title(): value for label, value in zip(labels, values)}

        # Guard the strip so a page without a DOB yields None instead of
        # raising AttributeError and losing the whole item.
        dob = response.xpath('//ul[@class="pdcol2"]//div[@class="info"]/text()').get()
        item.update({
            'Name': response.xpath('//div[@class="name t-colour"]/text()').get(),
            'DOB': dob.strip() if dob else dob,
            'Height': response.xpath('//ul[@class="pdcol3"]/li/div[@class="info"]/text()').get(),
            'Weight': response.xpath('//ul[@class="pdcol3"]/li[@class="u-hide"]/div[@class="info"]/text()').get(),
            'Nationality': response.xpath('//span[@class="playerCountry"]/text()').get(),
        })
        yield item
這是調用 scrapy crawl pl_stats -o players.json 后生成的 json 文件。
[
{
"Position": "Defender",
"Name": "Max Aarons",
"DOB": "04/01/2000",
"Height": "178cm",
"Weight": null,
"Nationality": "England"
},
{
"Position": "Forward",
"Club": "Manchester City",
"Name": "Juli\u00e1n \u00c1lvarez",
"DOB": "31/01/2000",
"Height": "170cm",
"Weight": "71kg",
"Nationality": "Argentina"
},
{
"Position": "Defender",
"Club": "Leicester City",
"Name": "Daniel Amartey",
"DOB": "21/12/1994",
"Height": "186cm",
"Weight": "79kg",
"Nationality": "Ghana"
},
{
"Position": "Forward",
"Name": "Will Alves",
"DOB": "04/05/2005",
"Height": null,
"Weight": null,
"Nationality": "England"
},
{
"Position": "Midfielder",
"Club": "Brighton and Hove Albion",
"Name": "Steven Alzate",
"DOB": "08/09/1998",
"Height": "180cm",
"Weight": "75kg",
"Nationality": "Colombia"
},
{
"Position": "Defender",
"Name": "Marcos Alonso",
"DOB": "28/12/1990",
"Height": "188cm",
"Weight": null,
"Nationality": "Spain"
},
{
"Position": "Midfielder",
"Name": "Jaime Alvarado",
"DOB": "26/07/1999",
"Height": "179cm",
"Weight": null,
"Nationality": "Colombia"
},
{
"Position": "Midfielder",
"Club": "Newcastle United",
"Name": "Miguel Almir\u00f3n",
"DOB": "10/02/1994",
"Height": "174cm",
"Weight": "70kg",
"Nationality": "Paraguay"
},
{
"Position": "Goalkeeper",
"Name": "\u00c1lvaro Fern\u00e1ndez",
"DOB": "13/04/1998",
"Height": "185cm",
"Weight": null,
"Nationality": "Spain"
},
{
"Position": "Midfielder",
"Club": "Everton",
"Name": "Allan",
"DOB": "08/01/1991",
"Height": "173cm",
"Weight": "73kg",
"Nationality": "Brazil"
},
{
"Position": "Goalkeeper",
"Club": "Liverpool",
"Name": "Alisson",
"DOB": "02/10/1992",
"Height": "191cm",
"Weight": "91kg",
"Nationality": "Brazil"
},
{
"Position": "Defender",
"Name": "Ezgjan Alioski",
"DOB": "12/02/1992",
"Height": "173cm",
"Weight": null,
"Nationality": "North Macedonia"
},
{
"Position": "Midfielder",
"Name": "Dele Alli",
"DOB": "11/04/1996",
"Height": "188cm",
"Weight": null,
"Nationality": "England"
},
{
"Position": "Defender",
"Name": "Alex Telles",
"DOB": "15/12/1992",
"Height": "181cm",
"Weight": null,
"Nationality": "Brazil"
},
{
"Position": "Defender",
"Club": "Liverpool",
"Name": "Trent Alexander-Arnold",
"DOB": "07/10/1998",
"Height": "175cm",
"Weight": "69kg",
"Nationality": "England"
},
{
"Position": "Defender",
"Name": "Ajibola Alese",
"DOB": "17/01/2001",
"Height": null,
"Weight": null,
"Nationality": "England"
},
{
"Position": "Defender",
"Name": "Toby Alderweireld",
"DOB": "02/03/1989",
"Height": "186cm",
"Weight": null,
"Nationality": "Belgium"
},
{
"Position": "Defender",
"Club": "Manchester City",
"Name": "Nathan Ak\u00e9",
"DOB": "18/02/1995",
"Height": "180cm",
"Weight": "75kg",
"Nationality": "Netherlands"
},
{
"Position": "Defender",
"Club": "Brentford",
"Name": "Kristoffer Ajer",
"DOB": "17/04/1998",
"Height": "198cm",
"Weight": "92kg",
"Nationality": "Norway"
},
{
"Position": "Midfielder",
"Club": "Leicester City",
"Name": "Marc Albrighton",
"DOB": "18/11/1989",
"Height": "175cm",
"Weight": "74kg",
"Nationality": "England"
},
{
"Position": "Defender",
"Club": "Wolverhampton Wanderers",
"Name": "Rayan A\u00eft-Nouri",
"DOB": "06/06/2001",
"Height": "179cm",
"Weight": "70kg",
"Nationality": "France"
},
{
"Position": "Defender",
"Name": "Ryan Alebiosu",
"DOB": "17/12/2001",
"Height": null,
"Weight": null,
"Nationality": "England"
},
{
"Position": "Defender",
"Name": "Ahmed El Mohamady",
"DOB": "09/09/1987",
"Height": "183cm",
"Weight": null,
"Nationality": "Egypt"
},
{
"Position": "Defender",
"Name": "Derek Agyakwa",
"DOB": "19/12/2001",
"Height": null,
"Weight": null,
"Nationality": "Netherlands"
},
{
"Position": "Forward",
"Name": "Sergio Ag\u00fcero",
"DOB": "02/06/1988",
"Height": "173cm",
"Weight": null,
"Nationality": "Argentina"
},
{
"Position": "Defender",
"Name": "Tayo Adaramola",
"DOB": "14/11/2003",
"Height": null,
"Weight": null,
"Nationality": "Ireland"
},
{
"Position": "Goalkeeper",
"Club": "Liverpool",
"Name": "Adri\u00e1n",
"DOB": "03/01/1987",
"Height": "190cm",
"Weight": "80kg",
"Nationality": "Spain"
},
{
"Position": "Forward",
"Club": "Southampton",
"Name": "Che Adams",
"DOB": "13/07/1996",
"Height": "175cm",
"Weight": "70kg",
"Nationality": "Scotland"
},
{
"Position": "Forward",
"Club": "Southampton",
"Name": "Adam Armstrong",
"DOB": "10/02/1997",
"Height": "174cm",
"Weight": "69kg",
"Nationality": "England"
},
{
"Position": "Forward",
"Name": "Tammy Abraham",
"DOB": "02/10/1997",
"Height": "190cm",
"Weight": null,
"Nationality": "England"
}
]
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.