簡體   English   中英

使用 scrapy 從預訂網站抓取,文件 csv 為空

[英]Scraping from booking website using scrapy, the file csv is empty

我正在嘗試使用 python 中的庫 scrapy 從預訂網站上抓取第一頁中顯示的酒店名稱,但我得到了一個空文件 csv,它只包含列的名稱,任何建議! 謝謝

這是 python 代碼:

import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter

class CsvPipeline(object):
    def __init__(self):
        self.file = open ('duproprio.tmp','wb')
        self.exporter = CsvItemExporter(self.file,str)
        self.exporter.start_exporting()
    def close_spider(self,spider):
        self.exporter.finish_exporting()
        self.file.close()
    def process_items(self,item,spider):
        self.exporter.export_item(item)
        return item

class DuProprioSpider(scrapy.Spider):
    name = "booking"
    start_urls = [
        "https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCAEoggI46AdIM1gEaIwBiAEBmAENuAEXyAEP2AED6AEBiAIBqAIDuALsycKNBsACAdICJGE1YmJmNDE1LWU2ZTEtNGEzMy05MTcyLThkYmQ2OGI5NWE5OdgCBOACAQ&sid=2e5b4623e13363b5ec7de2d7957c8c22&sb=1&sb_lp=1&src=theme_landing_index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fhotel%2Findex.fr.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaIwBiAEBmAENuAEXyAEP2AED6AEBiAIBqAIDuALsycKNBsACAdICJGE1YmJmNDE1LWU2ZTEtNGEzMy05MTcyLThkYmQ2OGI5NWE5OdgCBOACAQ%3Bsid%3D2e5b4623e13363b5ec7de2d7957c8c22%3B&ss=Maroc&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=ma&ac_position=1&ac_langcode=fr&ac_click_type=b&dest_id=143&dest_type=country&place_id_lat=32.4281&place_id_lon=-6.92197&search_pageview_id=7ca057bb44b9012d&search_selected=true&search_pageview_id=7ca057bb44b9012d&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0"]
    
    custom_settings = {
        'LOG_LEVEL':logging.WARNING,
        'ITEM_PIPELINES':{'__main__.CsvPipeline':1},
        'FEED_FORMAT':'csv',
        'FEED_URI':'bookingresult.csv'
        }
    
    #count = 0
    #total = 25
    
    def parse(self,response):
        #self.count =+25
        nexturl = "https://www.booking.com/searchresults.fr.html?label=gog235jc-1DCAIojAFCAm1hSA1YA2iMAYgBAZgBDbgBF8gBD9gBA-gBAfgBAogCAagCA7gCj9q5jQbAAgHSAiQ1MDlhN2M0Ny0yMmYwLTRiNDUtYjNhMC0xY2Y1MTg3NWM5ODfYAgTgAgE&sid=2e5b4623e13363b5ec7de2d7957c8c22&aid=356980&dest_id=-38833&dest_type=city&srpvid=00bd4bf5ca01008f&track_hp_back_button=1&nflt=ht_id%3D204&offset=0"
        for i in response.css('div._814193827>div>div>div>div>a'):
            yield scrapy.Request(url=i.xpath('@href').extract_first(),callback = self.parse_detail) 
        #if self.count < self.total+25:
        yield scrapy.Request(nexturl,self.parse)
    
    
    def parse_detail(self,response):
        nom_hotel = response.css('h2#hp_hotel_name.hp__hotel-name::text').get()
        
        yield{
            'nom_hotel' : nom_hotel.strip()
            }
        
process = CrawlerProcess(
    {
     'USER_AGENT':'Mozilla/4.0 (comatible;MSIE 7.0;Window NT 5.1)'
     })
process.crawl(DuProprioSpider)
process.start()

1.第一個結果是'\n'。 get all 的解決方案示例:

def parse_detail(self, response):
        nom_hotel = response.css('h2#hp_hotel_name.hp__hotel-name::text').getall()
        nom_hotel = ''.join(nom_hotel)
        yield{
            'nom_hotel': nom_hotel.strip()
        }

Output:

nom_hotel
Camp Sahara Holidays
Lovely House at La perle de Cabo Negro
Riad Dar Salam
Hôtel Auberge du Littoral
Kasbah Sirocco
...
...
...

2.你的管道是錯誤的,所以你會在很多空行之后在文件末尾得到結果。 或者只使用默認導出器:

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'bookingresult.csv'
    }

3.你不必輸入每個頁面只是為了得到名字,你可以從搜索結果頁面中抓取它。 例子:

    def parse(self, response):
        nexturl = "https://www.booking.com/searchresults.fr.html?label=gog235jc-1DCAIojAFCAm1hSA1YA2iMAYgBAZgBDbgBF8gBD9gBA-gBAfgBAogCAagCA7gCj9q5jQbAAgHSAiQ1MDlhN2M0Ny0yMmYwLTRiNDUtYjNhMC0xY2Y1MTg3NWM5ODfYAgTgAgE&sid=2e5b4623e13363b5ec7de2d7957c8c22&aid=356980&dest_id=-38833&dest_type=city&srpvid=00bd4bf5ca01008f&track_hp_back_button=1&nflt=ht_id%3D204&offset=0"
        all_names = response.xpath('//div[@data-testid="title"]/text()').getall()
        for name in all_names:
            yield {'nom_hotel': name}

它更快,因為您只需創建 1 個請求,而不是 26 個請求(第一個加上 25 個搜索結果)。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM