
yield scrapy.Request() not working properly for crawling next page

The same code works for a different site, but not with this one!

Site: http://quotes.toscrape.com/

It doesn't give any error and successfully crawls 8 pages (or `count` pages):

    import scrapy

    count = 8

    class QuotesSpiderSpider(scrapy.Spider):
        name = 'quotes_spider'
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            quotes = response.xpath('//*[@class="quote"]')

            for quote in quotes:
                text = quote.xpath('.//*[@class="text"]/text()').extract_first()
                author = quote.xpath('.//*[@class="author"]/text()').extract_first()

                yield {
                    'Text' : text,
                    'Author' : author
                }

            global count
            count = count - 1
            if count > 0:
                next_page = response.xpath('//*[@class="next"]/a/@href').extract_first()
                absolute_next_page = response.urljoin(next_page)
                yield scrapy.Request(absolute_next_page)

But it crawls only the 1st page of this site:

Site: https://www.goodreads.com/list/show/7

# -*- coding: utf-8 -*-
import scrapy

count = 5

class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ["goodreads.com/list/show/7"]
    start_urls = ["https://goodreads.com/list/show/7/"]

    def parse(self, response):
        books = response.xpath('//tr/td[3]')

        for book in books:
            bookTitle = book.xpath('.//*[@class="bookTitle"]/span/text()').extract_first()
            authorName = book.xpath('.//*[@class="authorName"]/span/text()').extract_first()

            yield {
                'BookTitle' : bookTitle,
                'AuthorName' : authorName
            }

        global count
        count = count - 1

        if count > 0:
            next_page_url = response.xpath('//*[@class="pagination"]/a[@class="next_page"]/@href').extract_first()
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=absolute_next_page_url)

I want to crawl a limited number of pages, or all of the pages, of the 2nd site.

You are using a domain with a path in allowed_domains. Scrapy's offsite filtering compares each request's hostname against the entries in allowed_domains, so an entry that contains a path never matches any hostname and every next-page request gets dropped.

allowed_domains = ["goodreads.com/list/show/7"]

should be

allowed_domains = ["goodreads.com"]
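To see why the path-bearing entry breaks pagination, here is a minimal sketch in plain Python (this is a simplification for illustration, not Scrapy's actual OffsiteMiddleware code) of the hostname-vs-allowed_domains comparison:

```python
from urllib.parse import urlparse

def is_allowed(url, allowed_domains):
    # Simplified version of Scrapy's offsite check: a request is on-site
    # if its hostname equals an allowed domain or is a subdomain of one.
    host = urlparse(url).hostname or ""
    return any(host == d or host.endswith("." + d) for d in allowed_domains)

next_page = "https://www.goodreads.com/list/show/7?page=2"

# With a path inside the entry, the hostname can never match:
print(is_allowed(next_page, ["goodreads.com/list/show/7"]))  # False -> filtered

# With just the registered domain, pagination requests pass:
print(is_allowed(next_page, ["goodreads.com"]))              # True
```

The start request is not subject to this offsite check, which is why exactly one page is crawled: the first page is fetched, but every subsequent Request yielded from `parse` is filtered out (Scrapy logs a "Filtered offsite request" debug message for each one).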
