[英]scrapy_redis Stop my spiders after x time of IDLE
我有一個scrapy_redis蜘蛛池,可以監聽redis隊列(蜘蛛的數量並不總是相同)。 此隊列由另一個腳本提供。 我希望我的蜘蛛在X分鍾不活動后停止,當redis隊列中沒有任何東西時。
我在settings.py中設置了SCHEDULER_IDLE_BEFORE_CLOSE,但它似乎不起作用。
這是我的settings.py :
# scrapy_redis scheduler: pulls requests from Redis instead of the in-memory queue.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Seconds the scheduler waits on an empty Redis queue before letting the
# spider close.  NOTE: this only takes effect with a SpiderQueue/SpiderStack
# style queue class, so SCHEDULER_QUEUE_CLASS below is required.
SCHEDULER_IDLE_BEFORE_CLOSE = 10
# FIFO queue (SpiderQueue); use LifoQueue for stack (SpiderStack) ordering.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
REDIS_HOST = 'localhost'
DOWNLOADER_MIDDLEWARES = {
    'serp_crawl.middlewares.RandomUserAgentMiddleware': 200,
    'scrapy_crawlera.CrawleraMiddleware': 300
}
CRAWLERA_ENABLED = True
CRAWLERA_USER = ''
CRAWLERA_PASS = ''
# Activate Crawlera User Agent
DEFAULT_REQUEST_HEADERS = {
    "X-Crawlera-UA": "pass",
}
UPDATE
這是我的蜘蛛代碼:
from scrapy_redis.spiders import RedisSpider
from elasticsearch import Elasticsearch
from serp_crawl.settings import *
from datetime import datetime
from redis import Redis
import scrapy
import json
class SerpSpider(RedisSpider):
    """Redis-fed SERP crawler.

    Pops JSON payloads from the ``serp_crawler:request`` Redis list, fetches
    the contained URL (routed through Crawlera via the request headers), and
    indexes the raw page body into Elasticsearch, then pushes a batching job
    back onto Redis.
    """

    name = "serpcrawler"
    # Redis list the spider blocks on for incoming request payloads.
    redis_key = 'serp_crawler:request'

    def __init__(self, redis_host='localhost', redis_port='6379',
                 elasticsearch_host='localhost', elasticsearch_port='9200',
                 mysql_host='localhost', dev=False,):
        super(SerpSpider, self).__init__()
        self.platform = None
        self.dev = bool(dev)
        # redis-py expects an integer port; cast so the string default (or a
        # string passed from the command line) still works.
        self.q = Redis(redis_host, int(redis_port))
        self.es = Elasticsearch([{'host': elasticsearch_host, 'port': elasticsearch_port}])

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Point scrapy_redis at the host given on the command line before
        # setup_redis() connects.  NOTE(review): mutating settings.attributes
        # directly is fragile; crawler.settings.set('REDIS_HOST', ...) is the
        # supported API -- TODO confirm settings are not frozen at this point.
        crawler.settings.attributes['REDIS_HOST'].value = kwargs['redis_host']
        # Deliberately skip RedisSpider.from_crawler and call setup_redis()
        # ourselves, after the REDIS_HOST override above is in place.
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj

    def make_requests_from_url(self, url):
        """Turn a JSON payload popped from Redis into a scrapy Request.

        The ``url`` argument is actually a JSON string with keys ``url``,
        ``platform``, ``keyword``, ``id_keyword`` and ``country``.
        """
        data = json.loads(url)
        # Bug fix: logger.info is %-style lazy formatting, not print(); the
        # original call passed the URL as an unused format argument.
        self.logger.info('Got new url to parse: %s', data['url'])
        # Switch the Crawlera user agent to match the requested platform.
        self.settings.attributes['DEFAULT_REQUEST_HEADERS'].value.attributes['X-Crawlera-UA'].value = data['platform']
        self.platform = data['platform']
        return scrapy.Request(url=data['url'], callback=self.parse,
                              meta={'keyword': data['keyword'],
                                    'id': data['id_keyword'],
                                    'country': data['country'],
                                    'platform': data['platform']}, dont_filter=True)

    def parse(self, response):
        """Index the raw response into Elasticsearch and enqueue a batch job."""
        doc = dict()
        try:
            doc['content'] = response.body.decode('cp1252')
        except UnicodeDecodeError:
            # Fall back to raw bytes when the body is not cp1252 text.
            doc['content'] = response.body
        doc['date'] = datetime.now().strftime('%Y-%m-%d')
        doc['keyword'] = str(response.meta['keyword'])
        doc['type_platform'] = str(response.meta['platform'])
        doc['country'] = str(response.meta['country'])
        if not self.dev:
            id_index = self.es.index(index='serp_html', doc_type='page', body=doc)
            # NOTE(review): lpush of a dict relies on redis-py str()-ifying it;
            # newer redis-py versions reject dicts -- consider json.dumps().
            self.q.lpush('batching_serp',
                         {'id_index': str(id_index['_id']),
                          'type_batching': 'default',
                          'country': doc['country'],
                          'type_platform': doc['type_platform'],
                          'keyword': doc['keyword'],
                          'id_keyword': int(response.meta['id'])})
            self.logger.info('Indexed new page. id_es : [' + str(id_index['_id']) + ']')
謝謝你的幫助。
scrapy-redis
文檔說:
# Max idle time to prevent the spider from being closed when distributed crawling.
# This only works if queue class is SpiderQueue or SpiderStack,
# and may also block the same time when your spider start at the first time (because the queue is empty).
SCHEDULER_IDLE_BEFORE_CLOSE = 10
因此,要讓 SCHEDULER_IDLE_BEFORE_CLOSE 生效,您需要把隊列類設置為以下任一項(分別對應 SpiderQueue 與 SpiderStack):
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# or
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.