[英]Scrapy returns only the first product for each page
我正在学习 Scrapy,但被一个问题困住了 3 天,也许你们当中有人可以给我提供解决方案或建议。 我想从网站的某个分类中提取所有产品,对于每个产品,我只需要 3 个字段:其中 2 个字段来自列出所有产品的分类页面,另外 1 个字段来自产品详情页(即产品代码); 为此,我访问了每个产品的链接。 所有提取到的产品都会交给 ItemLoader 处理。 在 items 文件中,我为所有字段使用了 MapCompose 和 TakeFirst 处理器。
问题是我的代码只从每个页面中提取第一个产品。
这是代码:
import os
import scrapy
from scrapy.loader import ItemLoader
from ..items import CutotulItem
class CutotulSpiderLoader(scrapy.Spider):
    """Spider from the question, reproduced with its original behaviour.

    ``parse`` walks the ``div.columns-container`` nodes of a listing page,
    schedules a request for the details link, builds an item through an
    ``ItemLoader`` and then follows the "next page" link.  ``parse_details``
    stores the SKU text on the spider instance as ``self.model``.
    """

    name = 'cutotul_spider_loader'
    start_urls = ['https://cutotul.ro/39-karcher-aspiratoare-profesionale']

    def __init__(self):
        # Spider-wide slot: written by parse_details(), read by parse().
        self.model = ""

    def start_requests(self):
        """Yield the single request for the category listing page."""
        yield scrapy.Request(
            'https://cutotul.ro/39-karcher-aspiratoare-profesionale',
            callback=self.parse,
        )

    def parse(self, response):
        """Yield a details request and an item per node, then paginate."""
        products = response.css("div.columns-container")
        for product in products:
            # NOTE: every XPath below starts with "//", so it is evaluated
            # against the whole document rather than relative to `product`.
            detail_href = product.xpath("//a[@class='lnk_view btn btn-default']/@href").get()

            # The details page is handled later by parse_details(); nothing
            # here waits for that callback to run.
            yield response.follow(url=detail_href, callback=self.parse_details)

            name_xpath = "//span[@class='grid-name']/text()"
            price_xpath = "//span[@class='price product-price']/text()"
            # Reads whatever value self.model holds at this moment.
            model_value = "".join(self.model)

            loader = ItemLoader(item=CutotulItem(), selector=product, response=response)
            loader.add_xpath("product_name", name_xpath)
            loader.add_xpath("product_price", price_xpath)
            loader.add_value("product_model", model_value)
            yield loader.load_item()

        # Keep crawling while the listing exposes a "next page" link.
        next_page = response.xpath("//li[@class='pagination_next']//@href").get()
        if next_page:
            yield response.follow(url=next_page, callback=self.parse)

    def parse_details(self, response):
        """Store the SKU text of a product details page in ``self.model``."""
        sku_nodes = response.css("span[itemprop='sku']")
        self.model = sku_nodes.css("::text").get()
我怎么解决这个问题?
非常感谢!
1. 需要使用相对 XPath(`.//a` 而不是 `//a`)。
2. `products` 只是这些项目的容器。
3. `parse_details` 不起作用 - scrapy 是异步的,因此它不会等待 `self.model` 更新,您将得到一个空字符串。

我将其作为示例进行了修复,但您可以根据需要进行修复。
import os
import scrapy
from scrapy.loader import ItemLoader
# from ..items import CutotulItem
class CutotulItem(scrapy.Item):
    """Item holding the three values collected for one product."""

    # Filled from the listing page.
    product_name = scrapy.Field()
    product_price = scrapy.Field()
    # Filled from the product details page (SKU text).
    product_model = scrapy.Field()
class CutotulSpiderLoader(scrapy.Spider):
    """Collect name, price and SKU for every product of a category listing.

    For each ``div.product-container`` on a listing page, ``parse`` downloads
    the product's details page inline, reads the SKU from it, and yields one
    ``CutotulItem``.  It then follows the "next page" link, if any.
    """

    name = 'cutotul_spider_loader'
    start_urls = ['https://cutotul.ro/39-karcher-aspiratoare-profesionale']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and the ``model`` attribute.

        Extra positional/keyword arguments are forwarded to ``scrapy.Spider``
        so the spider can still be constructed with spider arguments.
        """
        super().__init__(*args, **kwargs)
        # Only parse_details() writes this attribute; parse() keeps the SKU
        # in a local variable instead of sharing it through the instance.
        self.model = ""

    def start_requests(self):
        """Yield one request per entry of ``start_urls``, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    async def parse(self, response):
        """Yield one item per product on the page, then the next-page request.

        Args:
            response: the listing page response.

        Yields:
            Loaded ``CutotulItem`` objects and, when a next page exists, a
            request for it with this method as callback.
        """
        for product in response.css('div.product-container'):
            # Relative XPath (leading "."): search inside this product only.
            details_link = product.xpath(".//a[@class='lnk_view btn btn-default']/@href").get()

            model = ""
            if details_link:
                req = response.follow(url=details_link)
                # NOTE(review): this mirrors the original call.  Newer Scrapy
                # releases document `engine.download(request)` wrapped in
                # `scrapy.utils.defer.maybe_deferred_to_future` for inline
                # requests - confirm against the installed Scrapy version.
                resp = await self.crawler.engine.download(req, self)
                # default="" keeps `model` a string when the SKU node is absent.
                model = resp.css("span[itemprop='sku']").css("::text").get(default="")

            loader = ItemLoader(item=CutotulItem(), selector=product, response=response)
            loader.add_xpath("product_name", ".//span[@class='grid-name']/text()")
            loader.add_xpath("product_price", ".//span[@class='price product-price']/text()")
            loader.add_value("product_model", model)
            yield loader.load_item()

        # Keep crawling while the listing exposes a "next page" link.
        next_page = response.xpath("//li[@class='pagination_next']//@href").get()
        if next_page:
            yield response.follow(url=next_page, callback=self.parse)

    def parse_details(self, response):
        """Store the SKU text of a details page in ``self.model``.

        ``parse`` in this class does not schedule this callback; it is kept
        so existing references to the method continue to work.
        """
        self.model = response.css("span[itemprop='sku']").css("::text").get()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.