[英]Scrapy spider can't seem to find the xpath for the next page
我的蜘蛛可以在第一页中抓取我想要的任何内容,但是当它尝试查找下一页的 xpath 时,我收到索引超出范围的错误。 我在 shell 中测试过,xpath 看起来不错,所以现在我不知道该怎么做。
rom scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from lrrytas.items import LrrytasItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class LrrytasSpider(Spider):
name = "lrrytas"
allowed_domains = ['http://www.lrytas.lt/']
start_urls = ["http://www.lrytas.lt/?id=14355922181434706286&view=6"]
rules = (
Rule(LinkExtractor(allow=r'Items'), callback='parse_item', follow=True),
Rule(LinkExtractor(restrict_xpaths=('//*[@class="comment-box-head"]/*')), callback='parse_comments_follow_next_page', follow=True)
)
def parse(self, response):
sel = Selector(response)
site = sel.xpath('//*[@class="comment"]/*')
node = sel.xpath('//*[@class="comments"]/*')
for i in range(0, len(site), 2):
item = LrrytasItem()
item['name'] = node[i].xpath('*/div[contains(@class, "comment-nr")]/text()').extract()[0]
item['ip'] = node[i].xpath('*/*/div[contains(@class, "comment-ip")]/text()').extract()[0]
item['time'] = node[i].xpath('*/*/div[contains(@class, "comment-time")]/text()').extract()[0]
item ['comment'] = site[i + 1].xpath('descendant-or-self::text()').extract()[0]
yield item
def parse_comments_follow_next_page(self, response):
next_page = xpath('//*[contains(text(), "Kitas >>") and contains(@href, "id")]/@href')
if next_page:
url = response.urljoin(next_page[0].extract())
yield Request(url, self.parse)
编辑:我使用len()
使循环更加自动化,然后手动
您的CrawlSpider
规则和 next_page 检查的XPath
在我看来似乎不太合适。 所以我想建议使用一个简单的Spider
并手动处理下一页请求。 我已经编译了一些代码来展示如何做到这一点:
import scrapy
class Comment(scrapy.Item):
name = scrapy.Field()
ip = scrapy.Field()
time = scrapy.Field()
class MySpider(scrapy.Spider):
name = 'lrytas'
allowed_domains = ['www.lrytas.lt']
start_urls = ['http://www.lrytas.lt/?id=14355922181434706286&view=6']
def parse(self, response):
xpath_comments = '//div[@class="comments"]/div[@class="comment"]'
sel_comments = response.xpath(xpath_comments)
for sel in sel_comments:
item = Comment()
item['name'] = ' '.join(sel.xpath('.//div[@class="comment-nr"]//text()').extract())
item['time'] = ' '.join(sel.xpath('.//div[@class="comment-time"]//text()').extract())
# Other item fields go here ...
yield item
# Check if there is a next page link ...
xpath_NextPage = './/a[contains(.,"Kitas >>")][1]/@href' # Take on of the two links
if response.xpath(xpath_NextPage):
# If YES: Create and submit request
url_NextPage = 'http://www.lrytas.lt' + response.xpath(xpath_NextPage).extract()[0]
request = scrapy.Request(url_NextPage, callback=self.parse)
yield request
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.