Scrapy 不抓取从分页收集的链接

Question

我正在尝试抓取一个电子商务网站上的商品，但目前遇到的问题是：通过分页得到的页面并没有全部被访问。这些链接本身是有效且可以访问的，并非不存在。

我的蜘蛛代码：

import scrapy
import json
from pbl.items import ShopCard

class SpidermaximaSpider(scrapy.Spider):
    """Crawl the MAXIMA product listing on trobos.lt and dump it to JSON.

    ``parse`` handles listing pages: it follows every product link and, from
    the start page only, schedules the numbered listing pages.
    ``parse_main_item`` extracts one product and appends it to the shared
    ``item`` list, which ``closed`` writes to ``spiderMaxima.json``.
    """

    name = 'spiderMaxima'
    # Must cover the host used in ``start_urls``. With 'www.trobos.lt' the
    # off-site filter dropped every 'trobos.lt' request that did not set
    # ``dont_filter=True`` -- i.e. all pagination requests.
    allowed_domains = ['trobos.lt']
    start_urls = ['https://trobos.lt/prekes?vendor=MAXIMA']
    # Accumulator for scraped products; ``list[0]['product']`` below is this
    # same object, so appending here fills the structure dumped in ``closed``.
    item = []
    # NOTE: the attribute name shadows the builtin inside the class body; it
    # is kept so existing references to ``SpidermaximaSpider.list`` still work.
    list = [{
        'sid': 10,
        'name': 'Maxima',
        'domain': 'https://www.maxima.lt/',
        'imageurl': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
        'product': item
        }]

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then set up the XPath expressions."""
        # Forward arguments so ``scrapy crawl -a key=value`` keeps working.
        super().__init__(*args, **kwargs)
        self.declare_xpath()

    def declare_xpath(self):
        """Define the XPath expressions used by the parse callbacks."""
        self.getAllItemsXpath = '//*[@id="category"]/div/div[1]/div/div[3]/div[4]/div/div/div/div/div/a/@href'
        self.TitleXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/h1/text()'
        self.PriceXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/div[1]/div/div[1]/div/div[1]/span/text()'

    def parse(self, response):
        """Yield product requests and, from the start page, listing pages.

        Args:
            response: Response for a listing page.

        Yields:
            ``scrapy.Request`` objects for product pages and, when
            ``response`` is the un-paginated start page, for pages 1-192.
        """
        # No ``dont_filter`` here: it was only needed to bypass the off-site
        # filter, and leaving the duplicate filter on avoids scraping a
        # product twice when two listing pages link to it.
        for href in response.xpath(self.getAllItemsXpath).extract():
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_main_item)

        # Fan out only from the start URL. Appending '&page=N' to a URL that
        # already carries a page parameter would produce an ever-growing set
        # of unique URLs and the crawl would never finish.
        if '&page=' in response.url:
            return
        for page in range(1, 193):
            url = '{}&page={}'.format(response.url, page)
            self.logger.debug('Queueing listing page %s', url)
            yield scrapy.Request(url, callback=self.parse)

    def parse_main_item(self, response):
        """Extract one product and append it to the shared ``item`` list.

        Products whose price is missing or cannot be parsed are logged and
        skipped instead of raising inside the callback.

        Args:
            response: Response for a product page.
        """
        title = response.xpath(self.TitleXpath).extract_first()
        raw_price = response.xpath(self.PriceXpath).extract_first()
        if raw_price is None:
            self.logger.warning('No price found on %s', response.url)
            return
        try:
            # The first whitespace-separated token is the number, with a
            # decimal comma. ``split()`` also copes with leading or
            # non-breaking spaces.
            price = float(raw_price.replace(',', '.').split()[0])
        except (IndexError, ValueError):
            self.logger.warning('Unparseable price %r on %s',
                                raw_price, response.url)
            return

        shop = ShopCard()
        shop['item'] = {
                'title': title,
                'link': response.url,
                'image': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
                'price': price
            }

        self.item.append(shop['item'])

    def closed(self, reason):
        """Write the collected shop structure to ``spiderMaxima.json``.

        Args:
            reason: Value passed by the caller on close; not used here.
        """
        # ``ensure_ascii=False`` emits raw non-ASCII characters, so pin the
        # file encoding instead of relying on the platform default.
        with open("spiderMaxima.json", "w", encoding="utf-8") as final:
            json.dump(self.list, final, indent=2, ensure_ascii=False)

我之所以用 range() 生成链接列表，是因为在响应中（通过 scrapy shell 的 view(response) 查看）分页按钮是由脚本驱动的。我还在 scrapy shell 中测试了其中几个链接，XPath 都能正常输出结果，但这些页面仍然没有被抓取。可能是什么问题？还有其他处理分页的方法吗？

Answer 1

您的代码有很多问题，还有其他可以改进的地方。 请仔细阅读文档。

没有必要把 XPath 存成实例属性。
XPath 表达式可以写得更简短。
可以一开始就直接生成完整的 start_urls。
可以让 item exporter 来处理 JSON 输出。

这是一个示例，根据您的需要进行更改。

import scrapy


class ShopCard(scrapy.Item):
    """Scrapy item with a single ``item`` field."""

    # The spider in this file stores a dict with the keys
    # 'title', 'link', 'image' and 'price' in this field.
    item = scrapy.Field()


class SpidermaximaSpider(scrapy.Spider):
    """Crawl the MAXIMA product listing on trobos.lt, one item per product.

    All listing pages are generated up front in ``start_urls``. ``parse``
    follows the product links on each page and ``parse_main_item`` yields a
    ``ShopCard`` that the feed exporter writes to ``spiderMaxima.json``.
    """

    name = 'spiderMaxima'
    allowed_domains = ['trobos.lt']
    start_urls = [f'https://trobos.lt/prekes?vendor=MAXIMA&page={i}' for i in range(1, 190)]
    # Not used by this class; kept for backward compatibility.
    items = []

    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,
        'FEEDS': {
            'spiderMaxima.json': {
                'format': 'json',
                'indent': 2,
                # Write non-ASCII characters as-is instead of \uXXXX escapes.
                'encoding': 'utf8',
                # Replace the file on each run (Scrapy 2.4+); appending a
                # second JSON array would leave the file invalid.
                'overwrite': True,
                }
        }
    }

    def parse(self, response):
        """Yield a request for every product linked from a listing page.

        Args:
            response: Response for a listing page.

        Yields:
            Requests for product pages, handled by ``parse_main_item``.
        """
        for url in response.xpath('//div[@class="card small"]//a[contains(@class, "shrink")]/@href').getall():
            yield response.follow(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        """Yield a ``ShopCard`` for one product page.

        Pages whose price is missing or cannot be parsed are logged and
        skipped instead of raising inside the callback.

        Args:
            response: Response for a product page.

        Yields:
            A ``ShopCard`` whose ``item`` field holds title, link, image and
            price.
        """
        title = response.xpath('//h1/text()').get()
        raw_price = response.xpath('//div[@class="price"]//span/text()').get()
        if raw_price is None:
            self.logger.warning('No price found on %s', response.url)
            return
        try:
            # The first whitespace-separated token is the number, with a
            # decimal comma. ``split()`` also copes with leading or
            # non-breaking spaces.
            price = float(raw_price.replace(',', '.').split()[0])
        except (IndexError, ValueError):
            self.logger.warning('Unparseable price %r on %s',
                                raw_price, response.url)
            return

        shop = ShopCard()
        shop['item'] = {
            'title': title,
            'link': response.url,
            'image': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
            'price': price
        }

        yield shop

Scrapy 不抓取从分页收集的链接

问题描述

1 个解决方案

解决方案1
0 2022-06-09 11:15:26

Scrapy 不抓取从分页收集的链接

问题描述

1 个解决方案

解决方案1 0 2022-06-09 11:15:26

解决方案1
0 2022-06-09 11:15:26