[英]crawler is not crawling second start_url
I'm trying to scrape movie-reviews and tv-news from hindustantimes.com.我正在尝试从hindustantimes.com 中抓取电影评论和电视新闻。 When i'm running this code it is only scraping the first start_url but it's not able to scrape the second start_url.
当我运行此代码时,它只会抓取第一个 start_url,但无法抓取第二个 start_url。 I think the counter has to be reset but I'm not able to find out how.
我认为必须重置计数器,但我不知道如何重置。 I want to scrape the n numbers of pages from both the start_url.
我想从 start_url 中抓取 n 个页面。
import scrapy
#test_push
from..items import HindustantimesItem
class HindustantimesSpider(scrapy.Spider):
name = 'Hindustantimes_review'
page_number = 2
count = 0
def start_requests(self):
urls = ['https://www.hindustantimes.com/movie-reviews/page/?pageno={}',
'https://www.hindustantimes.com/tv/page/?pageno={}',
]
ur = []
for url in urls:
for i in range(1,3):
x = url.format(i)
yield scrapy.Request(url=x, callback=self.parse,)
def parse(self, response):
print("-------^^^^^^---------")
print(response.request.url)
items = {}
with open('output.txt', 'a') as the_file:
the_file.write(response.request.url)
the_file.write( "\n")
title_xpath = ['//*[@id="scroll-container"]/ul/li[{}]/div/div[2]/div/a/text()', '/html/body/div[1]/section/div[2]/div/div[1]/div[2]/ul/li[{}]/div/div[2]/div[1]/a/text()']
page_review_xpath = ['//*[@id="scroll-container"]/ul/li[{}]/div/div[2]/p/text()','/html/body/div[1]/section/div[2]/div/div[1]/div[2]/ul/li[{}]/div/div[2]/div[2]/text()']
page_link_xpath = ['//*[@id="scroll-container"]/ul/li[{}]/div/div[2]/div/a/@href', '/html/body/div[1]/section/div[2]/div/div[1]/div[2]/ul/li[{}]/div/div[2]/div[1]/a/@href']
if HindustantimesSpider.count ==0:
current_title_xpath = title_xpath[0]
current_review_xpath = page_review_xpath[0]
current_link_xpath = page_link_xpath[0]
HindustantimesSpider.count+=1
else:
current_title_xpath = title_xpath[1]
current_review_xpath = page_review_xpath[1]
current_link_xpath = page_link_xpath[1]
count = response.xpath(current_title_xpath.format("*")).getall()
count = len(count)
i = 1
while i<=count:
outputs = HindustantimesItem()
outputs['page_title'] = response.xpath(current_title_xpath.format(i)).get()
outputs['review_content'] = response.xpath(current_review_xpath.format(i)).get()
outputs['review_link'] = response.xpath(current_link_xpath.format(i)).get()
i+=1
fl = 0
if outputs['page_title'] == []:
outputs['page_title'] = ''
fl+=1
if outputs['review_content'] == []:
outputs['review_content'] = ''
fl+=1
if outputs['review_link'] == []:
outputs['review_link'] = ''
fl += 1
else:
if 'Review:' in outputs['page_title'].split(" ") or 'review:' in outputs['page_title'].split(" "):
yield outputs
pass
def start_requests(self):
urls = ['https://www.hindustantimes.com/movie-reviews/page/?pageno={}',
'https://www.hindustantimes.com/tv/page/?pageno={}',
]
ur = []
for url in urls:
for i in range(1,3):
x = url.format(i)
yield scrapy.Request(url=x, callback=self.parse,dont_filter=True)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.