[英]Python Scrapy keep getting same page link from next page button
我正在嘗試將amazon.com的產品鏈接刪除,但具有800多個評論,但我不斷從下一頁按鈕獲得相同的頁面鏈接,因此它會一遍又一遍地返回第2頁,我應該在其中獲得第3,4頁和依此類推
我已經設置了條件,以匯總和比較“ STRING”(如1,020)大於或等於800的整數,然后基於訪問該頁面的條件進行比較
這是代碼
# -*- coding: utf-8 -*-
import scrapy
from amazon.items import AmazonItem
from urlparse import urljoin
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
item = AmazonItem()
review_selector = './/*[@class="acs_product-rating__review-count"]/text()'
link_selector = './/*[@class="a-link-normal"]/@href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
item['LINKS'] = url
if url:
yield scrapy.Request(url, callback=self.parse_link, meta={'item': item})
next_page = './/span[@class="pagnRA"]/a[@id="pagnNextLink"]/@href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)
def parse_link(self, response):
item = AmazonItem(response.meta['item'])
catselector = '.cat-link ::text'
defaultcatselector = '.nav-search-label ::text'
cat = response.css(catselector).extract_first()
if cat:
item['CATAGORY'] = cat
else:
item['CATAGORY'] = response.css(defaultcatselector).extract_first()
return item
這是我遞歸調用解析函數之前打印下一頁鏈接時的輸出
將下一頁代碼塊移出循環。
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
review_selector = './/*[@class="acs_product-rating__review-count"]/text()'
link_selector = './/*[@class="a-link-normal"]/@href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
next_page = './/span[@class="pagnRA"]/a[@id="pagnNextLink"]/@href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.