I am trying to automatically generate a CSV every time I run `scrapy crawl myspider`. I have tried to use Scrapy's FEEDS setting, but I am getting the following error:
twisted.internet.error.ReactorNotRestartable
This is my code:
import scrapy
import json
from ..items import inmobiliarias
from scrapy import Spider
from scrapy.selector import Selector
class IdealistaSpider(scrapy.Spider):
    """Crawl idealista.com search-result pages and yield one item per listing.

    Results are exported to CSV through the FEEDS setting below; the spider
    walks pages 1-4 of a fixed shape-filtered search.
    """

    name = 'idealista'

    # BUG FIX: Scrapy only honours the *plural* class attribute
    # `custom_settings`; the original `custom_setting` was silently ignored,
    # which is why the FEEDS CSV export never ran.
    custom_settings = {
        'FEEDS': {
            '/Users/aleja/Documentos OneDrive/items.csv': {'format': 'csv'},
        },
    }

    def start_requests(self):
        """Yield one request per result page (1-4) with browser-like headers.

        The headers imitate a real Chrome session to reduce the chance of
        the site serving a bot-detection page instead of listings.
        """
        url = 'https://www.idealista.com/areas/venta-viviendas/pagina-{page}?shape=%28%28osyuFrkkUqPo%60%40rEaGxm%40UA%7ERRnInApEgUlBkG%7DCiGtE%29%29'
        headers = {
            "authority": "www.idealista.com",
            "cache-control": "max-age=0",
            "sec-ch-ua": "\"Chromium\";v=\"94\", \"Google Chrome\";v=\"94\", \";Not A Brand\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "sec-fetch-site": "none",
            "sec-fetch-mode": "navigate",
            "sec-fetch-user": "?1",
            "sec-fetch-dest": "document",
            "accept-language": "es-ES,es;q=0.9",
        }
        for page in range(1, 5):
            yield scrapy.Request(
                url=url.format(page=page),
                headers=headers,
                callback=self.parse_json,
            )

    def parse_json(self, response):
        """Parse a results page and yield an `inmobiliarias` item per listing.

        Each `div.item-info-container` block is re-parsed with its own
        Selector so the relative XPaths stay scoped to one listing.
        """
        page_sel = Selector(text=response.text)
        containers = page_sel.xpath('//div[@class="item-info-container"]').extract()
        for element in containers:
            sel = Selector(text=element)
            # BUG FIX: create a fresh item per listing. The original built a
            # single `inmobiliarias()` before the loop and mutated it each
            # iteration, so every yielded item aliased the same object.
            item = inmobiliarias()
            item['title'] = sel.xpath('//a[@role="heading"]/@title').extract()
            item['price'] = sel.xpath('//div/span[@class="item-price h2-simulated"]/text()').extract()
            item['phone'] = sel.xpath('//span[@class="icon-phone item-not-clickable-phone"]/text()').extract()
            # Use extract_first() so a listing without a link yields an empty
            # URL instead of raising IndexError (original used [0].extract()).
            href = sel.xpath('//a[@role="heading"]/@href').extract_first()
            item['url'] = 'https://www.idealista.com' + href if href else ''
            item['meters'] = sel.xpath('//span[small/text()="m²"]/text()').extract()
            item['rooms'] = sel.xpath('//span[small/text()="hab."]/text()').extract()
            item['real_state'] = sel.xpath('//picture[@class="logo-branding"]/a//@alt').extract()
            item['garage'] = sel.xpath('//div/span[@class="item-parking"]/text()').extract()
            details = sel.xpath('//span[@class="item-detail"]/text()').extract()
            last_item_detail = details[-1] if details else ''
            # BUG FIX: the original compared this string with `!=` against the
            # *lists* stored in meters/rooms, which is always unequal; test
            # membership in those lists instead.
            if last_item_detail and last_item_detail not in item['meters'] \
                    and last_item_detail not in item['rooms']:
                item['floor'] = last_item_detail
            else:
                item['floor'] = ''
            yield item
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any question, please contact: yoyou2525@163.com.