无法使用scrapy转到下一页

Question

我正在尝试借助 Scrapy 库完成一个电商网站爬虫。一切都很顺利，但我就是找不到办法进入宜家网站（我正在抓取的网站）的下一页。

我的实际代码

import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter


class CsvPipeline(object):
    """Item pipeline that writes every scraped item to 'ikeaSpiderSofa.tmp' as CSV.

    The output file is opened when the pipeline is instantiated and closed
    in ``close_spider``.
    """

    def __init__(self):
        # CsvItemExporter expects a file opened in binary mode.
        self.file = open('ikeaSpiderSofa.tmp', 'wb')
        # The second positional parameter of CsvItemExporter is
        # ``include_headers_line``; pass it explicitly by keyword instead of
        # relying on the truthiness of an unrelated object.
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the output file.

        The file is closed even if finishing the export raises, so the
        handle is never leaked.
        """
        try:
            self.exporter.finish_exporting()
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Export *item* to the CSV file and return it unchanged."""
        self.exporter.export_item(item)
        return item


class ikeaSpider(scrapy.Spider):
    """Spider for the IKEA France sofa category.

    ``parse`` handles a category listing page: it follows every product link
    to ``parse_detail`` and requests a further listing page.
    ``parse_detail`` yields one dict per product page.
    """
    name = "ikeaSpider"

    # The first listing page has no query string; pages 2-72 use ?page=N.
    start_urls = ['https://www.ikea.com/fr/fr/cat/canapes-fu003/'] + [
        'https://www.ikea.com/fr/fr/cat/canapes-fu003/?page=%d' % page
        for page in range(2, 73)
    ]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.CsvPipeline': 1},  # Used for pipeline 1
        'FEED_FORMAT': 'csv',  # Used for pipeline 2
        'FEED_URI': 'ikeaSpiderSofa.csv'  # Used for pipeline 2
    }

    # Page counter shared by every listing response, and the highest page
    # number that ``parse`` will request.
    count = 1
    total = 70

    def parse(self, response):
        """Follow the product links of a listing page, then request another page.

        Yields one request per product link (handled by ``parse_detail``)
        and at most one request for a further listing page.
        """
        self.count += 1
        # Snapshot the counter: this generator is consumed lazily, so
        # ``self.count`` may already have been advanced by another response
        # by the time the bound check below runs.
        page = self.count
        nexturl = "https://www.ikea.com/fr/fr/cat/canapes-fu003/?page=%d" % page

        found_product = False
        for result in response.css('.range-revamp-product-compact__wrapper-link'):
            href = result.xpath('@href').extract_first()
            if not href:
                # A link element without an href cannot be followed.
                continue
            found_product = True
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_detail)

        # Request the next listing page once per page (not once per product),
        # and only when this page actually listed products.
        if found_product and page < self.total + 1:
            yield scrapy.Request(nexturl, self.parse)

    @staticmethod
    def _clean(value):
        """Return *value* without surrounding whitespace, or None if it is None."""
        return value.strip() if value is not None else None

    def parse_detail(self, response):
        """Extract the product fields from a product page and yield them as a dict.

        A field whose selector matches nothing is emitted as ``None`` instead
        of raising ``AttributeError`` on ``None.strip()`` and losing the item.
        """
        clean = self._clean
        # Breadcrumb entries are the 2nd to 6th <li> of the navigation list.
        breadcrumb_css = (
            '#content > div > div.range-revamp-page-container__inner > div > '
            'div:nth-child(1) > div > nav > ol > li:nth-child(%d) > a > span::text'
        )

        item = {
            # NOTE(review): this selects the 11th <meta> element of <head> and
            # .get() returns its serialized markup; presumably the product URL
            # was intended -- confirm whether response.url should be used.
            'producturl': clean(response.selector.xpath('/html/head/meta[11]').get()),
            'label': clean(response.css('.range-revamp-header-section__title--big.notranslate::text').get()),
            'price': clean(response.css('.range-revamp-price__integer::text').get()),
            'description': clean(response.css('.range-revamp-header-section__description-text::text').get()),
            'id': clean(response.css('.range-revamp-product-identifier__value::text').get()),
        }
        for level in range(1, 6):
            item['arbo%d' % level] = clean(response.css(breadcrumb_css % (level + 1)).get())

        yield item


# Project-wide settings for the crawl: only the User-Agent header is overridden.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}

# Register the spider with a crawler process and start the crawl.
process = CrawlerProcess(crawler_settings)
process.crawl(ikeaSpider)
process.start()

由于我知道所有 URL 的格式，我尝试把它们全部添加为起始 URL（start_urls），但得到的始终只是页面上的前 24 个产品。我也尝试过通过 nexturl 请求下一页，但仍然不起作用。

有人能帮我解决这个问题吗？

非常感谢

Answer 1

当您单击“显示更多”按钮时，网站会动态加载商品。查看浏览器开发者工具的“网络”选项卡，您会注意到有一个对 https://sik.search.blue.cdtapps.com/fr/fr/product-list-page/more-products?category=fu003&sort=RELEVANCE&start=1&end=2&c=lf&v=20211021 的 API 调用，它返回一个 JSON 响应。

上述 URL 中的 start 和 end 值分别表示要返回的起始产品和结束产品。您可以根据需要调整这些值。

我还修改了您导出 CSV 的代码。由于您的自定义 CSV 导出器没有做任何特殊处理，只需使用 Scrapy 的 FEEDS 设置即可实现相同的功能。

请参阅下面的示例实现，它只返回第一个产品。请查看 JSON 响应，其中还有更多可以抓取的字段。

import scrapy
import logging
from scrapy.crawler import CrawlerProcess


class ikeaSpider(scrapy.Spider):
    """Spider that reads sofa products from the JSON product-list endpoint.

    ``start_product`` and ``end_product`` are interpolated into the ``start``
    and ``end`` query parameters of the request URL. Items are written to
    'ikeaSpiderSofa.csv' through the ``FEEDS`` setting.
    """
    name = "ikeaSpider"
    start_product = 1
    end_product = 2

    start_urls = [
        f'https://sik.search.blue.cdtapps.com/fr/fr/product-list-page/more-products?category=fu003&sort=RELEVANCE&start={start_product}&end={end_product}&c=lf&v=20211021'
    ]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEEDS': {
            'ikeaSpiderSofa.csv': {
                'format': 'csv'
            }
        }
    }

    def parse(self, response):
        """Yield one dict per product found in the JSON response.

        Missing or null 'moreProducts', 'productWindow' or 'price' entries
        are tolerated: an absent product list yields nothing and an absent
        price yields ``None`` for that field, instead of raising on a
        chained ``.get()`` call.
        """
        data = response.json()
        # ``or`` also covers keys that are present but explicitly null.
        products = (data.get('moreProducts') or {}).get('productWindow') or []
        for product in products:
            price = product.get('price') or {}
            yield {
                'producturl': product.get('pipUrl'),
                'label': product.get('name'),
                'price': price.get('wholeNumber'),
                'description': product.get('mainImageAlt'),
                'id': product.get('id'),
            }


# Create the crawler process with a custom User-Agent, register the spider
# and start the crawl.
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})
process.crawl(ikeaSpider)
process.start()

无法使用scrapy转到下一页

问题描述

1 个解决方案

解决方案1
0 2021-10-29 03:56:13

无法使用scrapy转到下一页

问题描述

1 个解决方案

解决方案1 0 2021-10-29 03:56:13

解决方案1
0 2021-10-29 03:56:13