
Scraped data printing to terminal, but not saving to CSV file

I am working on a Scrapy project to scrape video game product information and reviews from Metacritic. The data I want is located on different pages, and I want to scrape product info into one CSV and reviews into a different CSV. Because of this, my code is more complicated than "scrape data, yield item": I need to yield one kind of item (product info), then yield a request to the game's review page, and then yield the other kind of item (product reviews).

My current code is working, but the scraped data only prints to the Anaconda Prompt terminal window, while the CSV files remain empty. All the data is scraped correctly, though, because I can see it in my terminal. The issue seems to be how the items are yielded and processed in pipelines.py.

Below is the code for items.py, game_spider.py, and pipelines.py. The spider code has been edited down significantly to include only the relevant parts, since it's rather long and complex.

items.py:
import scrapy

class GameItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    platform = scrapy.Field()
    genres = scrapy.Field()
    release_date = scrapy.Field()
    ESRB_rating = scrapy.Field()
    summary = scrapy.Field()
    average_user_score = scrapy.Field()
    metascore = scrapy.Field()
    developer = scrapy.Field()
    publisher = scrapy.Field()

class ReviewItem(scrapy.Item):
    title = scrapy.Field()
    platform = scrapy.Field()
    username = scrapy.Field()
    score = scrapy.Field()
    date = scrapy.Field()
    review_text = scrapy.Field()
    critic_flag = scrapy.Field()

game_spider.py:
from scrapy import Spider, Request
from games.items import GameItem, ReviewItem

class GameSpider(Spider):
    name = 'game_spider'
    allowed_urls = ['https://www.metacritic.com']
    start_urls = ['https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0']

    def parse(self, response):
        page_urls = ...  # scrape the URLs of all result pages (selector logic omitted)

        for url in page_urls:
            yield Request(url=url, callback=self.parse_game_urls, dont_filter=True)

    def parse_game_urls(self, response):
        game_urls = ...  # scrape each game URL from the result page (omitted)

        for url in game_urls:
            yield Request(url=url, callback=self.parse_game_page, dont_filter=True)

    def parse_game_page(self, response):

        # scrape game info into the local variables used below (omitted)

        item = GameItem()

        item['url'] = url
        item['title'] = title
        item['platform'] = platform
        item['genres'] = genres
        item['release_date'] = release_date
        item['ESRB_rating'] = ESRB_rating
        item['summary'] = summary
        item['average_user_score'] = average_user_score
        item['metascore'] = metascore
        item['developer'] = developer
        item['publisher'] = publisher

        yield item

        user_review_page = ...  # scrape the URL of the game's user review page (omitted)
        yield Request(url=user_review_page, callback=self.parse_user_reviews, dont_filter=True)

    def parse_user_reviews(self, response):
        reviews = ...  # scrape all review blocks on the page (omitted)
        for review in reviews:

            # scrape review info into the local variables used below (omitted)

            item = ReviewItem()

            item['title'] = title
            item['platform'] = platform
            item['username'] = username
            item['score'] = int(score)
            item['date'] = date
            item['review_text'] = review_text
            item['critic_flag'] = 0

            yield item
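
Not shown above: parse_user_reviews needs title and platform from the game page. A minimal sketch of one way to hand them off, using Request.meta (available in the Scrapy 1.6.0 this project runs on); the meta key names are illustrative, not taken from the original spider:

    def parse_game_page(self, response):
        ...
        # attach the already-scraped fields to the request so the review
        # callback can reuse them:
        yield Request(
            url=user_review_page,
            callback=self.parse_user_reviews,
            dont_filter=True,
            meta={'title': title, 'platform': platform},  # illustrative keys
        )

    def parse_user_reviews(self, response):
        # read the values back off the response:
        title = response.meta['title']
        platform = response.meta['platform']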


pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher


class GamesPipeline(object):

    def __init__(self):
        self.fileNamesCsv = ['GameItem','ReviewItem']
        self.files = {} 
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([ (name, open(name + '.csv','wb')) for name in self.fileNamesCsv])
        for name in self.fileNamesCsv:
            self.exporters[name] = CsvItemExporter(self.files[name])
            if name == 'GameItem':
                self.exporters[name].fields_to_export = ['url','title','platform','genres','release_date','ESRB_rating','summary',
                'average_user_score','metascore','developer','publisher']
                self.exporters[name].start_exporting()

            if name == 'ReviewItem':
                self.exporters[name].fields_to_export = ['title','platform','username','score','date','review_text','critic_flag']
                self.exporters[name].start_exporting()

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        typesItem = type(item)
        if typesItem in set(self.fileNamesCsv):
            self.exporters[typesItem].export_item(item)
        return item

In case it is helpful, this is what the terminal output looks like:

(base) C:\Users\bdbot\Desktop\games>scrapy crawl game_spider
2020-07-07 17:26:03 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: games)
2020-07-07 17:26:03 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.7, Platform Windows-10-10.0.18362-SP0
2020-07-07 17:26:03 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'games', 'DOWNLOAD_DELAY': 2, 'NEWSPIDER_MODULE': 'games.spiders', 'SPIDER_MODULES': ['games.spiders'], 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
2020-07-07 17:26:03 [scrapy.extensions.telnet] INFO: Telnet Password: 51cb3c8116353545
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled item pipelines:
['games.pipelines.GamesPipeline']
2020-07-07 17:26:03 [scrapy.core.engine] INFO: Spider opened
2020-07-07 17:26:03 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-07-07 17:26:03 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-07-07 17:26:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0> (referer: None)
2020-07-07 17:26:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=129> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:18 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=126> (failed 1 times): 504 Gateway Time-out
2020-07-07 17:26:19 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=125> (failed 1 times): 504 Gateway Time-out
2020-07-07 17:26:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=128> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:25 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=127> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=124> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=123> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=122> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=121> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=117> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=120> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=119> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/game/xbox/burnout-3-takedown> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.metacritic.com/game/xbox/burnout-3-takedown>
{'ESRB_rating': 'T',
 'average_user_score': 7.6,
 'developer': 'Criterion Games',
 'genres': 'Driving, Racing, Arcade',
 'metascore': 94.0,
 'platform': 'Xbox',
 'publisher': 'EA Games',
 'release_date': 'Sep  7, 2004',
 'summary': 'Burnout 3 challenges you to crash into (and through) busy '
            'intersections, while creating as much damage as possible. You can '
            'battle your way to the front of the pack by taking down rivals '
            'and causing spectacular crashes. For those who thirst for '
            'crashes, the game includes a crash mode that rewards you for '
            'creating massive pileups. With multiplayer gameplay, more than '
            '100 events, and 40 tracks, Burnout 3 provides intense speed and '
            'action.',
 'title': 'Burnout 3: Takedown',
 'url': 'https://www.metacritic.com/game/xbox/burnout-3-takedown'}
Finished Scraping Burnout 3: Takedown
2020-07-07 17:26:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/game/playstation-4/assassins-creed-chronicles-india> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=129)
2020-07-07 17:26:50 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.metacritic.com/game/playstation-4/assassins-creed-chronicles-india>

And so on, for each game item and each review item. They all print to the terminal window.

Try printing the absolute paths of your newly created CSV files to double-check where they're being created. Here's some pseudo-code:

# pipelines.py file
import os
...
    def spider_opened(self, spider):
        self.files = dict([ (name, open(name + '.csv','wb')) for name in self.fileNamesCsv])
        for name in self.fileNamesCsv:
            print(os.path.realpath(self.files[name].name)) # new
            self.exporters[name] = CsvItemExporter(self.files[name])
...
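
If the files turn up in an unexpected directory, the usual cause is that open() with a relative path resolves against the current working directory of the scrapy crawl process, not against the project folder. A sketch of one way to pin the output next to the pipeline module (the open_output helper is hypothetical, not part of the original code):

# pipelines.py
import os

# directory containing this pipelines.py file
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def open_output(name):
    # CsvItemExporter writes bytes, so the file must be opened in binary mode
    return open(os.path.join(BASE_DIR, name + '.csv'), 'wb')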

Rewriting my pipelines.py into two separate classes fixed my problem:

from scrapy.exporters import CsvItemExporter
from games.items import GameItem, ReviewItem

class GamesPipeline(object):

    def __init__(self):
        self.filename = 'games.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        if isinstance(item, GameItem):            
            self.exporter.export_item(item)
        return item

class ReviewsPipeline(object):

    def __init__(self):
        self.filename = 'game_reviews.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        if isinstance(item, ReviewItem):            
            self.exporter.export_item(item)
        return item
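
For what it's worth, the likely reason the original single pipeline exported nothing: type(item) returns the item class itself, while self.fileNamesCsv holds the strings 'GameItem' and 'ReviewItem', so the membership test in process_item was never true and export_item was never called. (The items still appeared in the terminal because Scrapy logs every scraped item at DEBUG level regardless of pipelines.) Keeping a single pipeline and keying the exporters on the class name should also work; a sketch of that alternative, untested against the original project:

    def process_item(self, item, spider):
        item_name = type(item).__name__  # 'GameItem' or 'ReviewItem'
        if item_name in self.exporters:
            self.exporters[item_name].export_item(item)
        return item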

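One more thing to double-check with the two-class version: both pipelines must be enabled in settings.py, or only one of the CSVs gets written. Assuming the project module is games, as in the spider's imports (the priority numbers are arbitrary):

# settings.py
ITEM_PIPELINES = {
    'games.pipelines.GamesPipeline': 300,
    'games.pipelines.ReviewsPipeline': 400,
}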