[英]Scrapy not scraping links gathered from pagination
我正在尝试抓取一个电子商务网站的产品信息,但目前遇到的问题是:并非所有通过分页得到的页面都被访问到了。这些链接本身是有效、可访问的,并非不存在的页面。
我的蜘蛛代码:
import scrapy
import json
from pbl.items import ShopCard
class SpidermaximaSpider(scrapy.Spider):
    """Crawl Maxima products from trobos.lt.

    Visits every pagination page of the vendor listing, follows each
    product link, and dumps all collected products to spiderMaxima.json
    when the spider closes.
    """

    name = 'spiderMaxima'
    # BUG FIX: was 'www.trobos.lt'. The site serves pages at
    # https://trobos.lt/... (no "www."), so OffsiteMiddleware silently
    # dropped every pagination request (they were yielded without
    # dont_filter=True) — which is exactly why "not all pages were
    # visited". The product requests only survived because they carried
    # dont_filter=True.
    allowed_domains = ['trobos.lt']
    start_urls = ['https://trobos.lt/prekes?vendor=MAXIMA']

    # Accumulator for scraped product dicts; `shops` wraps it in the
    # final JSON structure. Class-level mutables are shared, which is
    # acceptable here because Scrapy runs a single spider instance.
    item = []
    shops = [{
        'sid': 10,
        'name': 'Maxima',
        'domain': 'https://www.maxima.lt/',  # fixed typo: was 'hhttps://...'
        'imageurl': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
        'product': item,
    }]

    def __init__(self, *args, **kwargs):
        # Always chain to scrapy.Spider.__init__ so name/url kwargs keep working.
        super().__init__(*args, **kwargs)
        self.declare_xpath()

    def declare_xpath(self):
        # XPaths kept in one place so they are easy to update when the
        # site's markup changes.
        self.getAllItemsXpath = '//*[@id="category"]/div/div[1]/div/div[3]/div[4]/div/div/div/div/div/a/@href'
        self.TitleXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/h1/text()'
        self.PriceXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/div[1]/div/div[1]/div/div[1]/span/text()'

    def parse(self, response):
        """Handle one listing page: follow products, enqueue pagination."""
        for href in response.xpath(self.getAllItemsXpath):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_main_item, dont_filter=True)

        # The site paginates via JavaScript, so there is no plain "next"
        # link to follow — enqueue every page explicitly instead.
        # BUG FIX: build the URLs from the base start URL, not from
        # response.url; the old code appended '&page=N' to a URL that may
        # already contain a page parameter, producing '...&page=2&page=3'.
        # The duplicate filter makes re-yielding these from later pages a
        # no-op, so no dont_filter here.
        base = self.start_urls[0]
        for page in range(1, 193):
            yield scrapy.Request(f'{base}&page={page}', callback=self.parse)

    def parse_main_item(self, response):
        """Scrape one product detail page and accumulate it into self.item."""
        shop = ShopCard()
        title = response.xpath(self.TitleXpath).extract_first()
        price = response.xpath(self.PriceXpath).extract_first()
        # Guard against products with no price element: the old code
        # raised AttributeError on Price.replace(...) when the XPath
        # matched nothing. Price text looks like "1,23 Eur": normalize
        # the decimal comma and keep the leading number.
        if price is not None:
            price = float(price.replace(',', '.').split(' ')[0])
        shop['item'] = {
            'title': title,
            'link': response.url,
            'image': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
            'price': price,
        }
        self.item.append(shop['item'])

    def closed(self, reason):
        """Dump everything collected once the spider stops, whatever the reason."""
        # encoding='utf-8' is required because ensure_ascii=False writes
        # non-ASCII characters (Lithuanian product names) verbatim.
        with open("spiderMaxima.json", "w", encoding='utf-8') as final:
            json.dump(self.shops, final, indent=2, ensure_ascii=False)
我之所以使用由 range() 生成的链接列表,是因为在响应中(通过 scrapy shell 的 view(response) 查看)分页按钮是由脚本驱动的,没有普通的"下一页"链接可以跟随。我还在 scrapy shell 中测试了其中几个链接,XPath 的输出都是正常的,但这些页面仍然没有被抓取。问题可能出在哪里?还有其他处理分页的方法吗?
您的代码有很多问题,还有其他可以改进的地方,请仔细阅读文档。首先,不必在 parse 中手动拼接分页 URL,可以直接在 start_urls 中一次性生成所有分页链接;其次,不要在 closed() 中手动写 JSON 文件,应当使用 Scrapy 内置的 item exporter(FEEDS 设置)来处理 JSON 输出。下面是一个示例,请根据您的需要进行修改。
import scrapy
class ShopCard(scrapy.Item):
    # Single container field holding one scraped product dict
    # (title / link / image / price), filled in by the spider.
    item = scrapy.Field()
class SpidermaximaSpider(scrapy.Spider):
    """Improved Maxima spider.

    Generates every pagination URL up-front in start_urls (the site's
    pagination buttons are JS-driven, so there is no plain link to
    follow) and lets Scrapy's built-in feed exporter write the JSON
    output instead of dumping it by hand.
    """

    name = 'spiderMaxima'
    # No "www." — the pages live at https://trobos.lt/..., and a
    # mismatched domain would make OffsiteMiddleware drop the requests.
    allowed_domains = ['trobos.lt']
    # Request each listing page directly; range upper bound covers the
    # known page count of the vendor listing.
    start_urls = [f'https://trobos.lt/prekes?vendor=MAXIMA&page={i}' for i in range(1, 190)]
    # (Removed the unused leftover `items = []` class attribute.)
    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,  # be polite to the server
        'FEEDS': {              # built-in exporter replaces manual json.dump
            'spiderMaxima.json': {
                'format': 'json',
                'indent': 2,
            }
        }
    }

    def parse(self, response):
        """Follow every product-card link on one listing page."""
        for url in response.xpath('//div[@class="card small"]//a[contains(@class, "shrink")]/@href').getall():
            yield response.follow(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        """Yield one ShopCard item per product detail page."""
        shop = ShopCard()
        title = response.xpath('//h1/text()').get()
        price = response.xpath('//div[@class="price"]//span/text()').get()
        # BUG FIX: guard against products without a visible price — the
        # original raised AttributeError on Price.replace(...) when the
        # XPath matched nothing. Price text looks like "1,23 Eur":
        # normalize the decimal comma and keep the leading number.
        if price is not None:
            price = float(price.replace(',', '.').split(' ')[0])
        shop['item'] = {
            'title': title,
            'link': response.url,
            'image': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
            'price': price,
        }
        yield shop
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.