[英]Scrapy spider scrape only 2 pages
当我运行这段代码时,蜘蛛只爬取了 2 页就停止了,不会继续转到下一页。
我尝试了各种不同的修改方式,但始终无法爬取到第三页。
# -*- coding: utf-8 -*-
import scrapy
from realstatedata.items import RealstatedataItem
class RsdataSpider(scrapy.Spider):
    """Spider that walks the vivareal.com.br rental listing pages.

    Each response is scraped for property cards and then the
    "Próxima página" (next page) link, if any, is followed.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield the items found on ``response`` and a request for the next page.

        The next-page link is looked up first, the items of the current page
        are emitted, and only then is the follow-up request scheduled.
        """
        next_links = response.xpath('//a[@title="Próxima página"]/@href')

        yield from self.scrape(response)

        # No "next page" anchor on this page: nothing more to schedule.
        if not next_links:
            return

        # NOTE(review): the href is joined unchanged. If the site returns a
        # fragment-only link (e.g. '#pagina=2'), the resulting URL differs
        # from the current one only by its fragment and is presumably dropped
        # by Scrapy's duplicate filter - confirm.
        target = response.urljoin(next_links.extract_first())
        print("Found url: {}".format(target))
        yield scrapy.Request(target, callback=self.parse)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card in ``response``.

        The loop iterates over the parent element of every property-card
        ``<article>``; only the ``description`` field is populated for now.
        """
        cards = response.xpath('//article[@class="property-card__container js-property-card"]/..')
        for card in cards:
            listing = RealstatedataItem()
            title_nodes = card.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()')
            listing['description'] = title_nodes.extract_first()
            # Further fields, currently disabled:
            # listing['address'] = card.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            # listing['prop_area'] = card.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            # listing['prop_rooms'] = card.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            # listing['prop_bath'] = card.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            # listing['prop_parking'] = card.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            # listing['price_rent'] = card.xpath('.//p[@style="display: block;"]/text()').extract_first()
            # listing['price_cond'] = card.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            # listing['realstate_name'] = card.xpath('.//picture/img/@alt').extract_first()
            yield listing
将“路径”(path)中的 '#' 替换为 '?'(注意:网页上的“下一页”按钮本身不起作用):
import scrapy
from realstatedata.items import RealstatedataItem
class RsdataSpider(scrapy.Spider):
    """Spider that walks the vivareal.com.br rental listing pages.

    The site's "Próxima página" (next page) anchor carries the page number in
    a URL fragment (``#pagina=2``). Fragments are not sent to the server, and
    Scrapy's default duplicate filter ignores them, so following the link
    as-is keeps hitting the same page. The spider therefore turns the fragment
    into a query string (``?pagina=2``) before requesting it.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield the items of ``response`` and a request for the next page.

        Yields:
            Every item produced by :meth:`scrape`, followed by a
            ``scrapy.Request`` for the next page when a next-page link exists.
        """
        nextpageurl = response.xpath('//a[contains(@title,"Próxima página")]/@href')
        yield from self.scrape(response)
        if nextpageurl:
            path = nextpageurl.extract_first()
            # Got #pagina=2 => replace with ?pagina=2.
            # Only rewrite fragment-style links: blindly dropping the first
            # character would corrupt an href that is already a query string
            # or an absolute URL.
            if path.startswith('#'):
                path = '?' + path[1:]
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            # No explicit callback: Scrapy routes the response back to parse().
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card in ``response``.

        The loop iterates over the parent element of every property-card
        ``<article>``; only the ``description`` field is populated for now.
        """
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):
            item = RealstatedataItem()
            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            # Further fields, currently disabled:
            # item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            # item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            # item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            # item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            # item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            # item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            # item['price_cond'] = resource.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            # item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()
            yield item
部分输出:
{'description': ' Apartamento com 2 Quartos para Aluguel, 82m² '}
{'description': ' Apartamento com 4 Quartos à Venda/Aluguel 280m² '}
{'description': ' Apartamento com 2 Quartos para Aluguel, 70m² '}
{'description': ' Apartamento com 3 Quartos para Aluguel, 113m² '}
{'description': ' Apartamento com 2 Quartos para Venda/Aluguel 50m² '}
{'description': ' Apartamento com 2 Quartos para Venda/Aluguel 50m² '}
Found url: https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/?pagina=27
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.