[英]Stuck with Data Crawling on Scrapy
我的一位朋友正在开发一个抓取脚本,用来从页面抓取数据。一段时间后,我需要添加另一个字段,并且我成功添加了该字段。但问题是,该字段没有获取到 td 内部链接里的数据。字段名称是 "LastBatsman"。
资料网址:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
数据的XPath:
//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from digicricket.items import ODIorTestItem
class DigicricketMarsilOp1Spider(scrapy.Spider):
    """Scrape batting, bowling and match-summary data from digicricket pages.

    Launch with one or more comma-separated numeric match IDs, e.g.::

        scrapy crawl digicricket.marssil.op1 -a match_id=1385,1386

    Each crawled page yields a single ``ODIorTestItem`` whose
    ``Batsman_op1`` / ``Bowler_op1`` fields hold one dict per table row and
    whose ``other_op1`` field holds a one-element list with the summary dict.
    """

    name = "digicricket.marssil.op1"
    allowed_domains = ["digicricket.marssil.com"]

    def __init__(self, match_id=None):
        """Validate *match_id* (comma-separated digits) and build start_urls.

        :param match_id: comma-separated match IDs passed via ``-a match_id=…``
        :raises CloseSpider: when no ID is given or an ID is not numeric
        """
        if not match_id:
            raise CloseSpider('You forgot input Match ID/IDs')
        match_id_list = match_id.split(',')
        for mid in match_id_list:
            if not mid.isdigit():
                raise CloseSpider('Match ID = {0} is not a number'.format(mid))
        # Build the URL list once, after every ID has been validated
        # (the original rebuilt the whole list on each loop iteration).
        self.start_urls = [
            'http://digicricket.marssil.com/match/MatchData.aspx'
            '?op=1&match={0}'.format(mid)
            for mid in match_id_list]

    @staticmethod
    def _match_id_from_url(url):
        """Return the trailing ``match=`` value of a MatchData.aspx URL."""
        return url[url.rfind('=') + 1:]

    @staticmethod
    def _first_or_none(sel, xpath_expr):
        """Return the first node matched by *xpath_expr*, or None if empty.

        Replaces the original bare ``except:`` blocks, which would also have
        hidden genuine errors (typos, selector API changes, …).
        """
        extracted = sel.xpath(xpath_expr).extract()
        return extracted[0] if extracted else None

    @staticmethod
    def _stats_rows(table_html, match_id, first_label, rest_labels):
        """Parse one player-stats table into a list of row dicts.

        :param table_html: raw HTML of one ``<table>`` element
        :param match_id: match id copied into every row
        :param first_label: key for the first cell (the player name column)
        :param rest_labels: keys for the remaining cells, in column order
        """
        rows = []
        serial = 0
        for tr in BeautifulSoup(table_html).find_all('tr'):
            cells = tr.find_all('td')
            if not cells:
                continue  # header rows carry no <td>; skip them
            serial += 1
            row = {'sl': serial,
                   'match_id': match_id,
                   first_label: cells[0].get_text()}
            # zip tolerates short rows instead of raising IndexError
            for label, cell in zip(rest_labels, cells[1:]):
                row[label] = cell.get_text()
            rows.append(row)
        return rows

    def parse(self, response):
        """Build one ODIorTestItem from a MatchData page.

        Table layout on the page (0-indexed): table[1] = batsman stats,
        table[2] = bowler stats; the rest feed the summary dict.
        """
        item = ODIorTestItem()
        match_id = self._match_id_from_url(response.url)
        sel = Selector(response)
        tables = sel.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()

        item['Batsman_op1'] = (
            self._stats_rows(tables[1], match_id, 'Batsman',
                             ['R', 'B', '4s', '6s', 'SR'])
            if len(tables) > 1 else [])
        item['Bowler_op1'] = (
            self._stats_rows(tables[2], match_id, 'Bowler',
                             ['O', 'M', 'R', 'W', 'Econ'])
            if len(tables) > 2 else [])

        # Summary fields come from absolute XPaths, so extract each exactly
        # once (the original re-ran every query for each <tr> of the table).
        base = '//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
        summary = {
            'match_id': match_id,
            'InningsMatchDetails':
                self._first_or_none(sel, base + 'table[1]/tr/td/b/text()[1]'),
            'CurrentScore':
                self._first_or_none(sel, base + 'table[1]/tr/td/b/span/text()'),
            'OversRunRate':
                self._first_or_none(sel, base + 'table[1]/tr/td/b/text()[2]'),
            'Extras':
                self._first_or_none(sel, base + 'table[1]/tr/td/b/text()[3]'),
            'MatchResult':
                self._first_or_none(sel, base + 'table[1]/tr/td/b/text()[4]'),
            'RecentOvers':
                self._first_or_none(sel, base + 'table[4]/tr/td[2]/text()'),
            # BUG FIX: the batsman name sits inside an <a>, and the table is
            # nested under extra <div>/<tbody> levels, so the old expression
            # (…/table[6]/tr/td/text()) matched nothing.  Use // descendant
            # steps and descend into the link's text node.
            'LastBatsman':
                self._first_or_none(
                    sel,
                    '//*[@id="ctl00_ContentPlaceHolder1_divData"]'
                    '//table[6]//tr/td/a[1]/text()'),
        }
        item['other_op1'] = [summary]
        return item
您的 XPath 似乎缺少一些层级。在网页上,第二个 `table` 之前还有两层 `div`。把 `/` 替换为 `//` 可以解决这个问题。(由于我的浏览器会补全一些 `<tbody>` 标记,所以 `tr` 前面也使用了双斜杠。)
.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.