[英]Stuck with Data Crawling on Scrapy
我的一位朋友開發了一個抓取腳本,用來從頁面抓取數據。一段時間后,我需要再添加一個字段,並且已成功添加。但問題是,該字段沒有取到 td 內部鏈接中的數據。字段名稱是“LastBatsman”(最后一位擊球手)。
資料網址:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
數據的XPath:
//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from digicricket.items import ODIorTestItem
class DigicricketMarsilOp1Spider(scrapy.Spider):
    """Spider for the ``op=1`` match-data pages of digicricket.marssil.com.

    One ``ODIorTestItem`` is returned per response.  Three of its fields are
    filled in, each with a list of plain dicts:

    * ``Batsman_op1`` -- one dict per data row of the second table found
      directly under the ``ctl00_ContentPlaceHolder1_divData`` div.
    * ``Bowler_op1`` -- one dict per data row of the third such table.
    * ``other_op1`` -- a single dict of page-level values (score line,
      recent overs, last batsman, ...).
    """

    name = "digicricket.marssil.op1"
    allowed_domains = ["digicricket.marssil.com"]

    _URL_TEMPLATE = 'http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'
    _DATA_DIV = '//*[@id="ctl00_ContentPlaceHolder1_divData"]'

    # Output keys for the cells of a batting / bowling row, in column order.
    _BATSMAN_COLUMNS = ("Batsman", "R", "B", "4s", "6s", "SR")
    _BOWLER_COLUMNS = ("Bowler", "O", "M", "R", "W", "Econ")

    # (output key, XPath relative to _DATA_DIV) for the page-level values.
    _SUMMARY_XPATHS = (
        ("InningsMatchDetails", '/table[1]/tr/td/b/text()[1]'),
        ("CurrentScore", '/table[1]/tr/td/b/span/text()'),
        ("OversRunRate", '/table[1]/tr/td/b/text()[2]'),
        ("Extras", '/table[1]/tr/td/b/text()[3]'),
        ("MatchResult", '/table[1]/tr/td/b/text()[4]'),
        ("RecentOvers", '/table[4]/tr/td[2]/text()'),
    )

    # Queries for the "LastBatsman" value, tried in order.  The first one
    # targets the text of the link inside the cell and uses ``//`` so that
    # intermediate wrapper elements (extra <div>, <tbody>) do not break the
    # match; the second one is the direct-text query used previously.
    # NOTE(review): the link-based query follows the reported page layout;
    # confirm it against the live page.
    _LAST_BATSMAN_XPATHS = (
        '//table[6]//tr/td/a[1]/text()',
        '/table[6]/tr/td/text()',
    )

    def __init__(self, match_id=None, **kwargs):
        """Build ``start_urls`` from a comma-separated list of match IDs.

        :param match_id: one or more numeric match IDs separated by commas,
            e.g. ``"1385"`` or ``"1385,1386"``.  Whitespace around each ID
            is ignored.
        :param kwargs: any further spider arguments, forwarded to
            ``scrapy.Spider.__init__``.
        :raises CloseSpider: if ``match_id`` is missing/empty or any ID is
            not made up of digits only.
        """
        super(DigicricketMarsilOp1Spider, self).__init__(**kwargs)

        # NOTE(review): Scrapy documents CloseSpider as an exception for
        # spider callbacks; raised here it presumably surfaces as a plain
        # error while the spider is being constructed.  The exception type
        # is kept so existing callers observe the same failure.
        if not match_id:
            raise CloseSpider('You forgot input Match ID/IDs')

        match_ids = [part.strip() for part in match_id.split(',')]
        for one_id in match_ids:
            if not one_id.isdigit():
                raise CloseSpider('Match ID = {0} is not a number'.format(one_id))

        self.start_urls = [self._URL_TEMPLATE.format(one_id) for one_id in match_ids]

    def parse(self, response):
        """Turn one match-data page into an ``ODIorTestItem``.

        :param response: the downloaded page; the match ID is taken from
            whatever follows the last ``=`` in ``response.url``.
        :returns: the populated item.
        """
        sel = Selector(response)
        match_id = response.url[response.url.rfind('=') + 1:]
        tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()

        item = ODIorTestItem()
        item['Batsman_op1'] = []
        item['Bowler_op1'] = []
        item['other_op1'] = []

        # Only the second and third tables carry per-player rows; the
        # page-level values are read with document-wide XPath queries.
        if len(tables) > 1:
            item['Batsman_op1'] = self._stat_rows(tables[1], match_id, self._BATSMAN_COLUMNS)
        if len(tables) > 2:
            item['Bowler_op1'] = self._stat_rows(tables[2], match_id, self._BOWLER_COLUMNS)

        item['other_op1'].append(self._summary_row(sel, match_id))
        return item

    @staticmethod
    def _stat_rows(table_html, match_id, columns):
        """Convert the data rows of one HTML table into a list of dicts.

        Each dict holds ``sl`` (1-based running number of the data row),
        ``match_id`` and one entry per name in ``columns`` taken from the
        row's leading ``<td>`` cells.  Rows with fewer ``<td>`` cells than
        ``columns`` (header rows, spacer rows) are skipped.
        """
        # An explicit parser keeps the result independent of which optional
        # parser libraries happen to be installed.
        soup = BeautifulSoup(table_html, 'html.parser')
        rows = []
        for tr in soup.find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) < len(columns):
                continue
            row = {'sl': len(rows) + 1, 'match_id': match_id}
            for key, cell in zip(columns, cells):
                row[key] = cell.get_text()
            rows.append(row)
        return rows

    @staticmethod
    def _first_or_none(sel, query):
        """Return the first string matched by ``query``, or None if none."""
        matches = sel.xpath(query).extract()
        return matches[0] if matches else None

    def _summary_row(self, sel, match_id):
        """Collect the page-level values into a single dict.

        Every key is always present; a value is None when its XPath query
        matches nothing.
        """
        row = {}
        for key, suffix in self._SUMMARY_XPATHS:
            row[key] = self._first_or_none(sel, self._DATA_DIV + suffix)

        last_batsman = None
        for suffix in self._LAST_BATSMAN_XPATHS:
            last_batsman = self._first_or_none(sel, self._DATA_DIV + suffix)
            if last_batsman is not None:
                break
        row["LastBatsman"] = last_batsman

        row['match_id'] = match_id
        return row
您的XPath似乎缺少一些標簽。在網頁上,第二個 table 之前有兩層 div。用 // 替換 / 可以解決這些問題。(由於我的瀏覽器添加了一些 <tbody> 標記,因此 tr 前面也有一個雙斜杠。)
.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.