[英]Scraped data printing to terminal, but not saving in CSV file
我正在开发一个 scrapy 项目,以从 Metacritic 上抓取视频游戏产品信息和评论。 我想要的数据位于不同的页面上,我想将产品信息抓取到一个 CSV 文件,将评论抓取到另一个 CSV 文件。 因此,我的代码比简单的"抓取数据,产出项目"更复杂。 我需要产出一种项目(产品信息),然后向该游戏的评论页面发出请求,并产出另一种项目(产品评论)。
我当前的代码正在运行,但是被抓取的数据只打印到了 anaconda 提示符终端窗口,而 CSV 文件仍然为空。 不过,所有数据都被正确抓取了,因为我可以在终端中看到它。 问题似乎出在 pipeline.py 中项目是如何被生成和处理的。
下面是 items.py、myspider.py 和 pipeline.py 的代码。 蜘蛛代码已被大幅编辑,只包含相关部分,因为它相当长且复杂。
items.py:
import scrapy
class GameItem(scrapy.Item):
    """Product-level information scraped from a game's Metacritic page."""
    url = scrapy.Field()
    title = scrapy.Field()
    platform = scrapy.Field()
    genres = scrapy.Field()
    release_date = scrapy.Field()
    ESRB_rating = scrapy.Field()
    summary = scrapy.Field()
    average_user_score = scrapy.Field()
    metascore = scrapy.Field()
    developer = scrapy.Field()
    publisher = scrapy.Field()
class ReviewItem(scrapy.Item):
    """A single user or critic review scraped from a game's review page."""
    title = scrapy.Field()
    platform = scrapy.Field()
    username = scrapy.Field()
    score = scrapy.Field()
    date = scrapy.Field()
    review_text = scrapy.Field()
    # 0 for a user review, presumably 1 for a critic review — set by the spider
    critic_flag = scrapy.Field()
game_spider.py:
from scrapy import Spider, Request
from games.items import GameItem, ReviewItem
class GameSpider(Spider):
    """Crawl Metacritic game listings, yielding GameItem objects from game
    pages and ReviewItem objects from the linked user-review pages.

    NOTE(review): the ``= #scrape ...`` assignments below are placeholders
    left by the question author; the real scraping expressions were elided,
    so this snippet is not runnable as-is.
    """
    name = 'game_spider'
    allowed_urls = ['https://www.metacritic.com']
    start_urls = ['https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0']

    def parse(self, response):
        # Fan out one request per paginated result page.
        page_urls = #scrape all result pages
        for url in page_urls:
            yield Request(url=url, callback=self.parse_game_urls, dont_filter = True)

    def parse_game_urls(self, response):
        # One request per individual game page found on a result page.
        game_urls = #scrape each game url from each result page
        for url in game_urls:
            yield Request(url=url, callback=self.parse_game_page, dont_filter = True)

    def parse_game_page(self, response):
        # Yield the product-info item first, then follow the link to the
        # game's user-review page so reviews are yielded separately.
        #scrape game info
        item = GameItem()
        item['url'] = url
        item['title'] = title
        item['platform'] = platform
        item['genres'] = genres
        item['release_date'] = release_date
        item['ESRB_rating'] = ESRB_rating
        item['summary'] = summary
        item['average_user_score'] = average_user_score
        item['metascore'] = metascore
        item['developer'] = developer
        item['publisher'] = publisher
        yield item
        user_review_page = # scrape url to review page
        yield Request(url=user_review_page, callback=self.parse_user_reviews, dont_filter = True)

    def parse_user_reviews(self, response):
        # Yield one ReviewItem per review found on the review page.
        reviews = #scrape all reviews
        for review in reviews:
            #scrape review info
            item = ReviewItem()
            item['title'] = title
            item['platform'] = platform
            item['username'] = username
            item['score'] = int(score)
            item['date'] = date
            item['review_text'] = review_text
            item['critic_flag'] = 0
            yield item
pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher
class GamesPipeline(object):
    """Export GameItem and ReviewItem objects to separate CSV files.

    One CsvItemExporter is kept per item class name; process_item routes each
    scraped item to the matching exporter based on its class name.
    """

    def __init__(self):
        # Item class names double as CSV base filenames and exporter keys.
        self.fileNamesCsv = ['GameItem', 'ReviewItem']
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # CsvItemExporter requires a binary-mode file handle.
        self.files = {name: open(name + '.csv', 'wb') for name in self.fileNamesCsv}
        for name in self.fileNamesCsv:
            exporter = CsvItemExporter(self.files[name])
            if name == 'GameItem':
                exporter.fields_to_export = [
                    'url', 'title', 'platform', 'genres', 'release_date',
                    'ESRB_rating', 'summary', 'average_user_score',
                    'metascore', 'developer', 'publisher']
            elif name == 'ReviewItem':
                exporter.fields_to_export = [
                    'title', 'platform', 'username', 'score', 'date',
                    'review_text', 'critic_flag']
            exporter.start_exporting()
            self.exporters[name] = exporter

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files.values():
            f.close()

    def process_item(self, item, spider):
        # BUG FIX: the original compared type(item) — a class object — against
        # a set of *strings*, so the condition was never true and no item was
        # ever written to CSV (they only appeared in the terminal log).
        # Compare the class *name* instead.
        item_type = type(item).__name__
        if item_type in self.exporters:
            self.exporters[item_type].export_item(item)
        return item
如果有帮助,这就是终端输出的样子:
(base) C:\Users\bdbot\Desktop\games>scrapy crawl game_spider
2020-07-07 17:26:03 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: games)
2020-07-07 17:26:03 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.7, Platform Windows-10-10.0.18362-SP0
2020-07-07 17:26:03 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'games', 'DOWNLOAD_DELAY': 2, 'NEWSPIDER_MODULE': 'games.spiders', 'SPIDER_MODULES': ['games.spiders'], 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
2020-07-07 17:26:03 [scrapy.extensions.telnet] INFO: Telnet Password: 51cb3c8116353545
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-07-07 17:26:03 [scrapy.middleware] INFO: Enabled item pipelines:
['games.pipelines.GamesPipeline']
2020-07-07 17:26:03 [scrapy.core.engine] INFO: Spider opened
2020-07-07 17:26:03 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-07-07 17:26:03 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-07-07 17:26:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0> (referer: None)
2020-07-07 17:26:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=129> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:18 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=126> (failed 1 times): 504 Gateway Time-out
2020-07-07 17:26:19 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=125> (failed 1 times): 504 Gateway Time-out
2020-07-07 17:26:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=128> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:25 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=127> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=124> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=123> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=122> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=121> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=117> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=120> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=119> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/game/xbox/burnout-3-takedown> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=0)
2020-07-07 17:26:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.metacritic.com/game/xbox/burnout-3-takedown>
{'ESRB_rating': 'T',
'average_user_score': 7.6,
'developer': 'Criterion Games',
'genres': 'Driving, Racing, Arcade',
'metascore': 94.0,
'platform': 'Xbox',
'publisher': 'EA Games',
'release_date': 'Sep 7, 2004',
'summary': 'Burnout 3 challenges you to crash into (and through) busy '
'intersections, while creating as much damage as possible. You can '
'battle your way to the front of the pack by taking down rivals '
'and causing spectacular crashes. For those who thirst for '
'crashes, the game includes a crash mode that rewards you for '
'creating massive pileups. With multiplayer gameplay, more than '
'100 events, and 40 tracks, Burnout 3 provides intense speed and '
'action.',
'title': 'Burnout 3: Takedown',
'url': 'https://www.metacritic.com/game/xbox/burnout-3-takedown'}
Finished Scraping Burnout 3: Takedown
2020-07-07 17:26:50 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/game/playstation-4/assassins-creed-chronicles-india> (referer: https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page=129)
2020-07-07 17:26:50 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.metacritic.com/game/playstation-4/assassins-creed-chronicles-india>
依此类推,每个游戏项目和每个评论项目都是如此。 它们全都打印到了终端窗口。
尝试打印新创建的 csv 文件的绝对路径,以仔细检查它们的创建位置。 这是一些伪代码:
# pipelines.py file
import os
...
# Debug aid: print the absolute path of each CSV right after it is created,
# to verify where the files end up on disk.
def spider_opened(self, spider):
    self.files = dict([ (name, open(name + '.csv','wb')) for name in self.fileNamesCsv])
    for name in self.fileNamesCsv:
        print(os.path.realpath(self.files[name].name)) # new
        self.exporters[name] = CsvItemExporter(self.files[name])
...
将我的 pipeline.py 重写为两个单独的类解决了我的问题:
class GamesPipeline(object):
    """Write every scraped GameItem to games.csv, passing other items through."""

    def __init__(self):
        self.filename = 'games.csv'

    def open_spider(self, spider):
        # CsvItemExporter needs a binary-mode file handle.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        # Anything that is not a GameItem is left for the next pipeline.
        if not isinstance(item, GameItem):
            return item
        self.exporter.export_item(item)
        return item
class ReviewsPipeline(object):
    """Write every scraped ReviewItem to game_reviews.csv, passing other items through."""

    def __init__(self):
        self.filename = 'game_reviews.csv'

    def open_spider(self, spider):
        # CsvItemExporter needs a binary-mode file handle.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        # Anything that is not a ReviewItem is left for the next pipeline.
        if not isinstance(item, ReviewItem):
            return item
        self.exporter.export_item(item)
        return item
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.