简体   繁体   中英

crawler is not crawling second start_url

I'm trying to scrape movie-reviews and tv-news from hindustantimes.com. When i'm running this code it is only scraping the first start_url but it's not able to scrape the second start_url. I think the counter has to be reset but I'm not able to find out how. I want to scrape the n numbers of pages from both the start_url.

import scrapy
#test_push
from..items import HindustantimesItem

class HindustantimesSpider(scrapy.Spider):
    name = 'Hindustantimes_review'
    page_number = 2
    count = 0

    def start_requests(self): 

        urls = ['https://www.hindustantimes.com/movie-reviews/page/?pageno={}',
        'https://www.hindustantimes.com/tv/page/?pageno={}',
        ]
        ur = []
        for url in urls:
            for i in range(1,3):
                x = url.format(i)
                yield scrapy.Request(url=x, callback=self.parse,)

    def parse(self, response):
        print("-------^^^^^^---------")
        print(response.request.url)
        items = {}


        with open('output.txt', 'a') as the_file:
            the_file.write(response.request.url)
            the_file.write( "\n")

        title_xpath = ['//*[@id="scroll-container"]/ul/li[{}]/div/div[2]/div/a/text()', '/html/body/div[1]/section/div[2]/div/div[1]/div[2]/ul/li[{}]/div/div[2]/div[1]/a/text()']
        page_review_xpath = ['//*[@id="scroll-container"]/ul/li[{}]/div/div[2]/p/text()','/html/body/div[1]/section/div[2]/div/div[1]/div[2]/ul/li[{}]/div/div[2]/div[2]/text()']
        page_link_xpath = ['//*[@id="scroll-container"]/ul/li[{}]/div/div[2]/div/a/@href', '/html/body/div[1]/section/div[2]/div/div[1]/div[2]/ul/li[{}]/div/div[2]/div[1]/a/@href']
        if HindustantimesSpider.count ==0:
            current_title_xpath = title_xpath[0]
            current_review_xpath = page_review_xpath[0]
            current_link_xpath = page_link_xpath[0]
            HindustantimesSpider.count+=1
        else:
            current_title_xpath = title_xpath[1]
            current_review_xpath = page_review_xpath[1]
            current_link_xpath = page_link_xpath[1]

        count = response.xpath(current_title_xpath.format("*")).getall()
        count = len(count)
        i = 1
        while i<=count:
            outputs = HindustantimesItem()
            outputs['page_title'] = response.xpath(current_title_xpath.format(i)).get()
            outputs['review_content'] = response.xpath(current_review_xpath.format(i)).get()
            outputs['review_link'] = response.xpath(current_link_xpath.format(i)).get()
            i+=1
            fl = 0
            if outputs['page_title'] == []:
                outputs['page_title'] = ''
                fl+=1
            if outputs['review_content'] == []:
                outputs['review_content'] = ''
                fl+=1
            if outputs['review_link'] == []:
                outputs['review_link'] = ''
                fl += 1
            else:
                if 'Review:' in outputs['page_title'].split(" ") or 'review:' in outputs['page_title'].split(" "):
                    yield outputs

        pass
def start_requests(self): 

        urls = ['https://www.hindustantimes.com/movie-reviews/page/?pageno={}',
        'https://www.hindustantimes.com/tv/page/?pageno={}',
        ]
        ur = []
        for url in urls:
            for i in range(1,3):
                x = url.format(i)
                yield scrapy.Request(url=x, callback=self.parse,dont_filter=True)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM