繁体   English   中英

卡在Scrapy上抓取数据

[英]Stuck with Data Crawling on Scrapy

我的一位朋友正在开发一个抓取脚本,用于从页面抓取数据。一段时间后,他需要再添加一个字段,并且成功添加了该字段。但问题是,该字段没有抓取到 td 内部链接里的数据。该字段的名称是 "LastBatsman"(最后上场的击球手)。

资料网址:

http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385

数据的XPath:

//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td

import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

from digicricket.items import ODIorTestItem


class DigicricketMarsilOp1Spider(scrapy.Spider):
    """Spider for live/op=1 scorecard pages on digicricket.marssil.com."""
    name = "digicricket.marssil.op1"
    allowed_domains = ["digicricket.marssil.com"]

def __init__(self, match_id=None):
    """Build ``start_urls`` from a comma-separated list of numeric match IDs.

    :param match_id: comma-separated IDs, e.g. ``"1385"`` or ``"1385,1386"``.
    :raises CloseSpider: if no ID was supplied or any ID is not numeric.
    """
    # Chain to scrapy.Spider so its own initialisation (name, kwargs
    # handling) still runs; the original override skipped this.
    super(DigicricketMarsilOp1Spider, self).__init__()
    # Guard clause instead of the original for/else, which is easy to misread.
    if not match_id:
        raise CloseSpider('You forgot input Match ID/IDs')
    match_id_list = match_id.split(',')
    for i in match_id_list:
        if not i.isdigit():
            raise CloseSpider('Match ID = {0} is not a number'.format(i))
    self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
                       for i in match_id_list]

def parse(self, response):
    """Parse one scorecard page into an ``ODIorTestItem``.

    Table 2 (index 1) holds the batsman rows and table 3 (index 2) the
    bowler rows; table 1 plus tables 4 and 6 feed the ``other_op1``
    summary dict.

    :param response: scrapy Response for a MatchData.aspx?op=1 page.
    :returns: a populated ``ODIorTestItem``.
    """
    # The match id is the value of the last query parameter in the URL.
    match_id = response.url[response.url.rfind('=') + 1:]
    sel = Selector(response)

    def player_rows(table_html, field_names):
        # Convert each data <tr> of a stats table into a dict keyed by
        # field_names (one name per <td>, in document order). Rows
        # without <td> cells (headers) are skipped, as in the original.
        rows = []
        for tr in BeautifulSoup(table_html, 'html.parser').find_all('tr'):
            cells = tr.find_all('td')
            if not cells:
                continue
            row = {'sl': len(rows) + 1, 'match_id': match_id}
            for name, cell in zip(field_names, cells):
                row[name] = cell.get_text()
            rows.append(row)
        return rows

    def first_or_none(xpath):
        # Summary fields are optional on the page: a missing node yields
        # None instead of an IndexError (the original used bare excepts).
        try:
            return sel.xpath(xpath).extract()[0]
        except IndexError:
            return None

    item = ODIorTestItem()
    tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()

    item['Batsman_op1'] = (player_rows(tables[1], ["Batsman", "R", "B", "4s", "6s", "SR"])
                           if len(tables) > 1 else [])
    item['Bowler_op1'] = (player_rows(tables[2], ["Bowler", "O", "M", "R", "W", "Econ"])
                          if len(tables) > 2 else [])

    other = {'match_id': match_id}
    base = '//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
    other["InningsMatchDetails"] = first_or_none(base + 'table[1]/tr/td/b/text()[1]')
    other["CurrentScore"] = first_or_none(base + 'table[1]/tr/td/b/span/text()')
    other["OversRunRate"] = first_or_none(base + 'table[1]/tr/td/b/text()[2]')
    other["Extras"] = first_or_none(base + 'table[1]/tr/td/b/text()[3]')
    other["MatchResult"] = first_or_none(base + 'table[1]/tr/td/b/text()[4]')
    other["RecentOvers"] = first_or_none(base + 'table[4]/tr/td[2]/text()')
    # Fix for the question's bug: the batsman name sits inside an <a>
    # tag, and extra <div> levels sit between the container and table 6,
    # so descendant axes (//) and the link's text node are required.
    other["LastBatsman"] = first_or_none(
        '//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()')
    item['other_op1'] = [other]
    return item

您的 XPath 似乎漏掉了一些标签。在网页上,第二个 table 之前还有两层 div。用 // 替换 / 可以解决这些问题。(由于我的浏览器会添加一些 <tbody> 标记,因此 tr 前面也使用了双斜杠。)

.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM