I am trying to run multiple spiders. The first one works fine, but the second one raises an error: ReactorNotRestartable.
# Registry of RSS feeds, keyed by feed name. Each entry records the feed's
# display name plus the URL(s) the spider should fetch.
feeds = dict(
    nasa=dict(
        name='nasa',
        url='https://www.nasa.gov/rss/dyn/breaking_news.rss',
        start_urls=['https://www.nasa.gov/rss/dyn/breaking_news.rss'],
    ),
    xkcd=dict(
        name='xkcd',
        url='http://xkcd.com/rss.xml',
        start_urls=['http://xkcd.com/rss.xml'],
    ),
)
With the items above, I try to run two spiders in a loop, like this:
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import XMLFeedSpider
class MySpider(XMLFeedSpider):
    """XML-feed spider configured from the module-level ``feeds`` registry.

    The entry to use is selected by the class-level ``name`` attribute,
    which the caller rebinds before each crawl.
    """

    # Rebound externally (e.g. ``MySpider.name = feed_name``) before crawling.
    name = None

    def __init__(self, **kwargs):
        # Look up this spider's configuration before the base class
        # initialises itself.
        feed_cfg = feeds[self.name]
        self.start_urls = feed_cfg.get('start_urls')
        self.itertag = 'items'
        self.iterator = 'iternodes'
        super(MySpider, self).__init__(**kwargs)

    def parse_node(self, response, node):
        # Intentionally a no-op in this example.
        pass
def start_crawler():
    """Schedule one crawl per configured feed, then run the reactor once.

    Twisted's reactor cannot be restarted, so ``process.start()`` must be
    called exactly once, AFTER every spider has been scheduled. Calling it
    inside the loop (as the original code did) raises
    ``twisted.internet.error.ReactorNotRestartable`` on the second iteration.
    """
    process = CrawlerProcess({
        'USER_AGENT': CONFIG['USER_AGENT'],
        'DOWNLOAD_HANDLERS': {'s3': None}  # boto issues
    })
    for feed_name in feeds:
        # NOTE(review): mutating the class attribute means every scheduled
        # spider sees the LAST name assigned; prefer passing the name as a
        # crawl argument (see the parameterised variant below).
        MySpider.name = feed_name
        process.crawl(MySpider)
    # Blocking call; runs until all scheduled crawls finish. Must be outside
    # the loop so the reactor is started only once.
    process.start()
The exception during the second loop iteration looks like this: the spider is opened, but then:
...
2015-11-22 00:00:00 [scrapy] INFO: Spider opened
2015-11-22 00:00:00 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-11-22 00:00:00 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-11-21 23:54:05 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
Traceback (most recent call last):
File "env/bin/start_crawler", line 9, in <module>
load_entry_point('feed-crawler==0.0.1', 'console_scripts', 'start_crawler')()
File "/Users/bling/py-feeds-crawler/feed_crawler/crawl.py", line 51, in start_crawler
process.start() # the script will block here until the crawling is finished
File "/Users/bling/py-feeds-crawler/env/lib/python2.7/site-packages/scrapy/crawler.py", line 251, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1193, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1173, in startRunning
ReactorBase.startRunning(self)
File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 684, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
Do I have to invalidate the first MySpider somehow, or what am I doing wrong that I need to change to make this work? Thanks in advance.
It looks like you have to instantiate one process per spider; try:
def start_crawler():
    """Create and start a fresh CrawlerProcess for every configured feed."""
    for feed_name in feeds:
        settings = {
            'USER_AGENT': CONFIG['USER_AGENT'],
            'DOWNLOAD_HANDLERS': {'s3': None},  # boto issues
        }
        process = CrawlerProcess(settings)
        # Select the feed by rebinding the class-level name, then crawl.
        MySpider.name = feed_name
        process.crawl(MySpider)
        process.start()
The solution is to collect the spiders in the loop and start the process just once, at the end. My guess is that it has something to do with reactor allocation/deallocation.
def start_crawler():
    """Queue a spider for every feed in CONFIG, then run the reactor once.

    ``process.start()`` is deliberately outside the loop: the reactor can
    only be started a single time per process.
    """
    settings = {
        'USER_AGENT': CONFIG['USER_AGENT'],
        'DOWNLOAD_HANDLERS': {'s3': None},  # disable for issues with boto
    }
    process = CrawlerProcess(settings)
    for feed_name in CONFIG['Feeds']:
        MySpider.name = feed_name
        process.crawl(MySpider)
    process.start()
Thanks @eLRuLL for your answer; it inspired me to find this solution.
You can pass parameters to the crawl and use them in the parsing process.
class MySpider(XMLFeedSpider):
    """Feed spider that receives its feed name as a constructor argument
    rather than reading a mutated class attribute."""

    def __init__(self, name, **kwargs):
        # Initialise the base class first, then record which feed this
        # particular instance is responsible for.
        super(MySpider, self).__init__(**kwargs)
        self.name = name
def start_crawler():
    """Queue one crawl per feed, handing the feed name to the spider,
    then start the shared reactor exactly once."""
    process = CrawlerProcess({
        'USER_AGENT': CONFIG['USER_AGENT'],
        'DOWNLOAD_HANDLERS': {'s3': None},  # boto issues
    })
    for feed_name in list(feeds):
        # Positional crawl arguments are forwarded to the spider's __init__.
        process.crawl(MySpider, feed_name)
    process.start()
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.