Unable to go to next page with scrapy

Question

I'm trying to finish an e-shop scraper using the Scrapy library. Everything works pretty well, but I just can't find a way to go to the next page on the IKEA website (which I'm trying to scrape).

My current code

import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter


class CsvPipeline(object):
    """Item pipeline that writes every scraped item to ``ikeaSpiderSofa.tmp`` as CSV.

    The output file is opened when the pipeline is instantiated and is
    released again in ``close_spider``.
    """

    def __init__(self):
        # Opened in binary mode: the exporter is handed the raw file object.
        self.file = open('ikeaSpiderSofa.tmp', 'wb')
        # The second parameter is ``include_headers_line``; pass it as an
        # explicit boolean instead of relying on a truthy placeholder.
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the file, even if finishing fails."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Always release the handle so a failing exporter cannot leak it.
            self.file.close()

    def process_item(self, item, spider):
        """Export ``item`` and return it unchanged for any later pipeline."""
        self.exporter.export_item(item)
        return item


class ikeaSpider(scrapy.Spider):
    """Crawl the IKEA France sofa category and emit one item per product page.

    ``parse`` handles category listing pages and schedules the product pages
    linked from them; ``parse_detail`` extracts the fields of one product.
    """

    name = "ikeaSpider"

    # Listing pages 1..72; the first page carries no ``page`` query parameter.
    start_urls = ['https://www.ikea.com/fr/fr/cat/canapes-fu003/'] + [
        'https://www.ikea.com/fr/fr/cat/canapes-fu003/?page=%d' % page
        for page in range(2, 73)
    ]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.CsvPipeline': 1},  # Used for pipeline 1
        'FEED_FORMAT': 'csv',  # Used for pipeline 2
        'FEED_URI': 'ikeaSpiderSofa.csv'  # Used for pipeline 2

    }

    # ``count`` is bumped once per parsed listing response; pagination
    # requests stop being issued once it exceeds ``total``.
    count = 1
    total = 70

    def parse(self, response):
        """Schedule every product linked from a listing page, then the next page."""
        self.count += 1
        nexturl = "https://www.ikea.com/fr/fr/cat/canapes-fu003/?page=%d" % (self.count)

        for result in response.css('.range-revamp-product-compact__wrapper-link'):
            href = result.xpath('@href').get()
            if not href:
                # A link without an href cannot be requested; skip it.
                continue
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_detail)

        # Issued once per listing page (not once per product), and also when
        # the page happened to contain no matching product links.
        if self.count < self.total + 1:
            yield scrapy.Request(nexturl, self.parse)

    @staticmethod
    def _clean(value):
        """Return ``value`` without surrounding whitespace, or ``None`` if it is missing."""
        return value.strip() if value is not None else None

    def parse_detail(self, response):
        """Yield one dict describing the product shown on ``response``.

        Fields whose selector matches nothing are emitted as ``None`` instead
        of raising ``AttributeError``.
        """

        def text(selector):
            return self._clean(response.css(selector).get())

        item = {
            # NOTE(review): this is the serialised 11th <meta> element of
            # <head>, not a bare URL — confirm this is the intended value.
            'producturl': self._clean(response.selector.xpath('/html/head/meta[11]').get()),
            'label': text('.range-revamp-header-section__title--big.notranslate::text'),
            'price': text('.range-revamp-price__integer::text'),
            'description': text('.range-revamp-header-section__description-text::text'),
            'id': text('.range-revamp-product-identifier__value::text'),
        }

        # Breadcrumb levels arbo1..arbo5 are read from <li> 2..6 of the nav list.
        for level in range(1, 6):
            item['arbo%d' % level] = text(
                '#content > div > div.range-revamp-page-container__inner > div > '
                'div:nth-child(1) > div > nav > ol > li:nth-child(%d) > a > span::text'
                % (level + 1)
            )

        yield item


# Build a crawler process that sends a browser-like User-Agent header,
# register the spider with it and start the crawl.
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})
process.crawl(ikeaSpider)
process.start()

Since I know what all the URLs look like, I tried adding them as start URLs, but that only gives me the first 24 products of the page. I also tried doing it with next_url, but it still doesn't work.

Is anybody able to help me fix this?

Thanks a lot

Answer 1

The website loads items dynamically when you click on the Show More button. If you inspect the network tab, you will notice an API call to https://sik.search.blue.cdtapps.com/fr/fr/product-list-page/more-products?category=fu003&sort=RELEVANCE&start=1&end=2&c=lf&v=20211021, which returns a JSON response.

The start and end values in the above url indicate the start and end product to return respectively. You can adjust the values as suits you.

I have also edited your code for exporting to csv. Since you aren't doing anything special with your custom CSV exporter you can simply use the FEEDS scrapy setting to get the same functionality.

See the sample implementation below, which returns the first product only. Examine the JSON response: there are many more fields in it that you can scrape.

import scrapy
import logging
from scrapy.crawler import CrawlerProcess


class ikeaSpider(scrapy.Spider):
    """Read sofa products from the JSON product-list endpoint and emit one item each."""

    name = "ikeaSpider"
    # Window of products requested from the endpoint; these feed the
    # ``start`` and ``end`` query parameters of the URL below.
    start_product = 1
    end_product = 2

    start_urls = [
        f'https://sik.search.blue.cdtapps.com/fr/fr/product-list-page/more-products?category=fu003&sort=RELEVANCE&start={start_product}&end={end_product}&c=lf&v=20211021'
    ]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEEDS': {
            'ikeaSpiderSofa.csv': {
                'format': 'csv'
            }
        }
    }

    def parse(self, response):
        """Yield one flat dict per entry of ``moreProducts.productWindow``.

        Missing or null intermediate keys yield no items (or a ``None``
        price) instead of raising on a chained ``.get`` call.
        """
        data = response.json()
        # ``or`` fallbacks cover both an absent key and an explicit null.
        products = (data.get('moreProducts') or {}).get('productWindow') or []
        for product in products:
            price = product.get('price') or {}
            yield {
                'producturl': product.get('pipUrl'),
                'label': product.get('name'),
                'price': price.get('wholeNumber'),
                'description': product.get('mainImageAlt'),
                'id': product.get('id'),
            }


# Build a crawler process that sends a browser-like User-Agent header,
# register the spider with it and start the crawl.
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})
process.crawl(ikeaSpider)
process.start()

Unable to go to next page with scrapy

Question

1 answers

solution1
0 2021-10-29 03:56:13

Unable to go to next page with scrapy

Question

1 answers

solution1 0 2021-10-29 03:56:13

solution1
0 2021-10-29 03:56:13