Go to next page on showthread.php with scrapy

Question

I'm new to Scrapy. For about four days I've been stuck on how to go to the next page when scraping showthread.php (the forum is based on vBulletin).

My target: http://forum.femaledaily.com/showthread.php?359-Hair-Smoothing

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from femaledaily.items import FemaledailyItem

class Femaledaily(scrapy.Spider):
    """Spider that collects posts from the Hair Care board of forum.femaledaily.com.

    ``parse`` handles the board listing pages given in ``start_urls`` and
    requests each thread it finds; ``parse_thread`` emits one
    ``FemaledailyItem`` per post on the thread page it receives.
    """

    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def parse(self, response):
        """Request every thread linked from a board listing page."""
        for row in response.css("tbody > tr "):
            print("==========NEW THREAD======")
            hrefs = row.xpath('.//div[@class="threadlist-title"]/a/@href').extract()
            # The extracted href is appended to the forum root to form the thread URL.
            thread_url = "http://forum.femaledaily.com/" + hrefs[0]
            print(thread_url)
            yield scrapy.Request(thread_url, callback=self.parse_thread)

    def parse_thread(self, response):
        """Yield one item per post found on the received thread page."""
        author_xpath = './/div[@class="username_container"]/a/text()'
        wrapped_author_xpath = './/div[@class="username_container"]/a/span/text()'

        for post in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            item['thread_title'] = response.selector.xpath(
                '//span[@class="threadtitle"]/a/text()'
            ).extract()
            # item['thread_starter'] = response.selector.xpath('//div[@class="username_container"]/a/text()').extract_first()

            # When the link has no direct text, fall back to the text of the
            # <span> nested inside it.
            item['post_creator'] = (
                post.xpath(author_xpath).extract()
                or post.xpath(wrapped_author_xpath).extract()
            )

            # Concatenate the post's text nodes with tabs and newlines removed.
            fragments = post.xpath(
                ".//blockquote[@class='postcontent restore ']/text()"
            ).extract()
            item['post_content'] = "".join(
                fragment.replace('\t', '').replace('\n', '')
                for fragment in fragments
            )

            yield item

I'm able to get the first 10 posts of every thread, but I can't figure out how to go to the next page. Any ideas?

Answer 1

I made a slight change to your code so that it paginates properly:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from femaledaily.items import FemaledailyItem

class Femaledaily(scrapy.Spider):
    """Spider that collects posts from the Hair Care board, following pagination.

    ``parse`` handles board listing pages: it requests every thread linked
    from the page and then the next listing page. ``parse_thread`` handles
    thread pages: it yields one ``FemaledailyItem`` per post and then requests
    the next page of the same thread. Both callbacks stop paginating when the
    page has no ``rel="next"`` link.
    """

    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    # Kept as a public attribute for existing users; links are now resolved
    # with ``response.urljoin`` so absolute hrefs are handled as well.
    BASE_URL = "http://forum.femaledaily.com/"
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    # Href of the "next page" arrow; used for both listing and thread pages.
    NEXT_PAGE_XPATH = '//li[@class="prev_next"]/a[@rel="next"]/@href'

    def parse(self, response):
        """Request each thread on a listing page, then the next listing page.

        Yields:
            ``scrapy.Request`` objects for thread pages (handled by
            ``parse_thread``) and, if present, for the next listing page
            (handled by ``parse``).
        """
        for row in response.css("tbody > tr "):
            href = row.xpath(
                './/div[@class="threadlist-title"]/a/@href'
            ).extract_first()
            if not href:
                # A row without a thread link would raise IndexError on
                # ``[0]`` and abort the callback for all remaining rows.
                continue
            print("==========NEW THREAD======")
            yield scrapy.Request(response.urljoin(href), callback=self.parse_thread)

        next_request = self._next_page_request(response, self.parse)
        if next_request is not None:
            yield next_request

    def parse_thread(self, response):
        """Yield one item per post on a thread page, then request the next page.

        Yields:
            ``FemaledailyItem`` objects with ``thread_title``,
            ``post_creator`` and ``post_content`` set, followed by a
            ``scrapy.Request`` for the next page of the thread if there is one.
        """
        for post in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            item['thread_title'] = response.selector.xpath(
                '//span[@class="threadtitle"]/a/text()'
            ).extract()

            post_creator = post.xpath(
                './/div[@class="username_container"]/a/text()'
            ).extract()
            if not post_creator:
                # Fall back to the text of the <span> nested inside the link.
                post_creator = post.xpath(
                    './/div[@class="username_container"]/a/span/text()'
                ).extract()
            item['post_creator'] = post_creator

            # Concatenate the post's text nodes with tabs and newlines removed.
            fragments = post.xpath(
                ".//blockquote[@class='postcontent restore ']/text()"
            ).extract()
            item['post_content'] = "".join(
                fragment.replace('\t', '').replace('\n', '')
                for fragment in fragments
            )

            yield item

        next_request = self._next_page_request(response, self.parse_thread)
        if next_request is not None:
            yield next_request

    def _next_page_request(self, response, callback):
        """Return a request for the page behind the "next" link, or ``None``.

        Args:
            response: The page whose pagination bar is inspected.
            callback: The spider method that should handle the next page.

        Returns:
            A ``scrapy.Request`` for the first ``rel="next"`` href found, or
            ``None`` when the page has no such link (i.e. it is the last page).
        """
        href = response.xpath(self.NEXT_PAGE_XPATH).extract_first()
        if not href:
            return None
        # ``scrapy.Request`` rather than a bare ``Request``: only the
        # ``scrapy`` package itself is imported by this module.
        return scrapy.Request(response.urljoin(href), callback=callback)

Here, first extract the next page's link (i.e., the single forward arrow), then issue a request to that URL with the callback set to the same function it was called from. On the last page there is no next-page link, so the crawl stops.

Go to next page on showthread.php with scrapy

Question

1 answer

solution1
1 ACCEPTED 2015-07-01 07:14:19

Go to next page on showthread.php with scrapy

Question

1 answer

solution1 1 ACCEPTED 2015-07-01 07:14:19

solution1
1 ACCEPTED 2015-07-01 07:14:19