[英]python connect signal not being called
我有以下文件和代碼
import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured
logger = logging.getLogger(__name__)
class SpiderOpenCloseLogging:
def __init__(self, item_count):
self.item_count = item_count
self.items_scraped = 0
@classmethod
def from_crawler(cls, crawler):
print('Hey I am called')
# first check if the extension should be enabled and raise
# NotConfigured otherwise
# if not crawler.settings.getbool('MYEXT_ENABLED'):
# raise NotConfigured
# get the number of items from settings
item_count = 1000 #crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)
# instantiate the extension object
ext = cls(crawler.settings,crawler.stats)
# connect the extension object to signals
crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
# return the extension object
return ext
def spider_opened(self, spider):
logger.info("opened spider %s", spider.name)
def spider_closed(self, spider):
logger.info("closed spider %s", spider.name)
def item_scraped(self, item, spider):
self.items_scraped += 1
if self.items_scraped % self.item_count == 0:
logger.info("scraped %d items", self.items_scraped)
我已經更改了設置
MYEXT_ENABLED = True
EXTENSIONS = {
'project.custom_extension.SpiderOpenCloseLogging': 300
}
但是沒有信號被調用,我檢查了設置中給出的路徑,正在調用蜘蛛
事件我給出的打印沒有被記錄
有人可以建議我缺少什么
謝謝
所有信號都是從我對您的腳本的改編中調用的。 您犯了一些錯誤,對我來說這沒有任何意義,因為您沒有指定任何具體內容。 這就是為什么您沒有收到信號而是收到錯誤的原因:
幾個錯誤:
1.
def __init__(self, item_count, stats):
self.item_count = item_count
#self.items_scraped = 0 --- change this
self.items_scraped = stats
def item_scraped(self, item, spider):
# self.items_scraped += 1 --- You could do this but then you would not need `crawler.stats`
# if self.items_scraped % self.item_count == 0: --- these should be the other way around
logger.info("scraped %d items", self.items_scraped)
#additional note;
#--- you did not substantiate self.item_count, putting item_count
#in from_crawler does not work. Because you are returning ext, so
#self.item_count takes crawler.settings rather than item_count. So
#you will get an error.
一世。 通過更新,我們有以下更正:
def __init__(self, item_count, stats): # if you want to include crawler.stats
self.item_count = item_count
self.items_scraped = stats
ii.
def spider_opened(self, spider):
self.items_scraped = self.items_scraped.get_value('item_scraped_count') #use crawler.stats to get item_count
if self.items_scraped is None:
self.items_scraped = 0 #then instantiate with 0
self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000) #get you item count from settings
print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
logger.info("opened spider %s", spider.name)
iii.
def item_scraped(self, item, spider):
logger.info(f"scraped few {self.items_scraped} items")
self.items_scraped += 1
if self.item_count % self.items_scraped == 0: # these have been flipped
logger.info(f"scraped increments {self.items_scraped} items")
把這個放在一起的例子:
import logging
from scrapy import signals
import scrapy
logger = logging.getLogger(__name__)
class SpiderOpenCloseLogging(scrapy.Spider):
name = 'log_signals'
start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]
def __init__(self, item_count, stats):
self.item_count = item_count
self.items_scraped = stats
#self.items_scraped = 0
@classmethod
def from_crawler(cls, crawler):
ext = cls(crawler.settings,crawler.stats)
crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
return ext
def spider_opened(self, spider):
self.items_scraped = self.items_scraped.get_value('item_scraped_count')
if self.items_scraped is None:
self.items_scraped = 0
self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
logger.info("opened spider %s", spider.name)
def spider_closed(self, spider):
logger.info("closed spider %s", spider.name)
def item_scraped(self, item, spider):
logger.info(f"scraped few {self.items_scraped} items")
self.items_scraped += 1
if self.item_count % self.items_scraped == 0:
#print(f"scraped increments {self.items_scraped} items")
logger.info(f"scraped increments {self.items_scraped} items")
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url=url,
callback=self.parse
)
def parse(self, response):
content = response.xpath('//div[@class = "row"]//div')
for items in content:
yield {
'some_items_links':items.xpath(".//a//@href").get()
}
輸出:
.
.
.
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/author/C-S-Lewis'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/christianity/page/1/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/love/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/tag/truth/page/1/'}
...
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.