簡體   English   中英

Python(Scrapy)中通過 connect 連接的信號處理函數沒有被調用

[英]python connect signal not being called

我有以下文件和代碼

import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging:
    """Extension that logs when a spider opens and closes, plus a progress
    line every ``item_count`` scraped items.

    NOTE(review): as posted, ``from_crawler`` cannot build an instance; see
    the note next to the ``cls(...)`` call below.
    """

    def __init__(self, item_count):
        # Number of scraped items between two progress log lines.
        self.item_count = item_count
        # Running total, incremented by item_scraped().
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension and connect its handlers to the crawler's signals."""

        print('Hey I am called')
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        # (the check itself is commented out, so MYEXT_ENABLED is never read)
        # if not crawler.settings.getbool('MYEXT_ENABLED'):
        #     raise NotConfigured

        # interval between progress logs; hard-coded here because the
        # settings lookup is commented out
        item_count = 1000 #crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        # NOTE(review): two positional arguments are passed, but __init__
        # accepts only item_count, so this call raises TypeError; the local
        # item_count computed above is never used.
        ext = cls(crawler.settings,crawler.stats)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        """Log the name of the spider that was closed."""
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        """Count the item and log the total every ``item_count`` items."""
        self.items_scraped += 1
        # True when the running total is a multiple of item_count.
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

我已經更改了設置

# Switch for the custom extension.
MYEXT_ENABLED = True
# Extension registry: dotted path of the extension class -> order value.
EXTENSIONS = {'project.custom_extension.SpiderOpenCloseLogging': 300}

但是沒有任何信號處理函數被調用。我檢查過設置中給出的路徑,蜘蛛(spider)也確實在運行。

甚至連我加的 print 也沒有任何輸出

有人可以指出我遺漏了什麼嗎?

謝謝

在我對您的腳本做了改寫之後,所有信號都能被正常調用。您的代碼裡有幾處錯誤;由於您沒有給出任何具體的報錯信息,我無法判斷您那邊的確切情況,但這些錯誤正是您收不到信號、反而會得到錯誤的原因:

幾個錯誤:

1.

    def __init__(self, item_count, stats):
        self.item_count = item_count
        #self.items_scraped = 0 --- change this
        self.items_scraped = stats
    def item_scraped(self, item, spider):
        """Log the current value of ``self.items_scraped`` for every scraped item.

        Intermediate version from the answer: the question's
        ``self.items_scraped += 1`` and its
        ``self.items_scraped % self.item_count == 0`` check are both left out
        here. Keeping the increment would work as well, but then
        ``crawler.stats`` would not be needed.
        """
        logger.info("scraped %d items", self.items_scraped)
# Additional note:
# --- the local `item_count` set in from_crawler is never used: the instance
#     is built with cls(crawler.settings, crawler.stats), so self.item_count
#     receives crawler.settings rather than the number. With the question's
#     one-argument __init__ that call also raises an error, because it is
#     given one positional argument too many.

i. 更新之後,我們有以下更正:

 def __init__(self, item_count, stats): # if you want to include crawler.stats
        self.item_count = item_count
        self.items_scraped = stats

ii.

    def spider_opened(self, spider):
        """Turn the stored stats and settings objects into plain integers.

        On entry ``self.items_scraped`` holds the stats object and
        ``self.item_count`` holds the settings object. Afterwards both hold
        numbers: the ``item_scraped_count`` stat (0 when it is not set yet)
        and the ``MYEXT_ITEMCOUNT`` setting (default 1000).
        """
        already_scraped = self.items_scraped.get_value('item_scraped_count')
        # The stat is missing until something has been counted: start from 0.
        self.items_scraped = 0 if already_scraped is None else already_scraped
        log_every = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        self.item_count = log_every
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

iii.

    def item_scraped(self, item, spider):
        """Count one scraped item and log a progress line every ``item_count`` items.

        Args:
            item: the scraped item (not inspected).
            spider: the spider that produced the item (not inspected).
        """
        # Logged before the increment, so this shows the total so far
        # without the current item.
        logger.info("scraped few %s items", self.items_scraped)
        self.items_scraped += 1
        # The running total goes on the left of the modulo so that the
        # message fires at item_count, 2 * item_count, ...  With the operands
        # reversed it would fire only when the total divides item_count
        # (1, 2, 4, 5, ... for 1000) and never again past item_count.
        # The truthiness guard avoids a ZeroDivisionError for an interval of 0.
        if self.item_count and self.items_scraped % self.item_count == 0:
            logger.info("scraped increments %s items", self.items_scraped)

把以上內容放在一起的完整示例:


import logging
from scrapy import signals
import scrapy

logger = logging.getLogger(__name__)

class SpiderOpenCloseLogging(scrapy.Spider):
    """Spider for quotes.toscrape.com that also handles its own signals.

    Pages 1-10 are requested and link targets found on them are yielded as
    items. The ``spider_opened``, ``spider_closed`` and ``item_scraped``
    signals are connected to methods of this same class, which log them.
    """

    name = 'log_signals'

    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]

    def __init__(self, item_count, stats):
        """Store the objects handed over by ``from_crawler``.

        Args:
            item_count: despite the name, ``from_crawler`` passes
                ``crawler.settings`` here; ``spider_opened`` replaces it
                with the integer ``MYEXT_ITEMCOUNT`` value.
            stats: ``from_crawler`` passes ``crawler.stats`` here;
                ``spider_opened`` replaces it with the integer item counter.
        """
        # Let scrapy.Spider run its own initialisation as well.
        super().__init__()
        self.item_count = item_count
        self.items_scraped = stats

    @classmethod
    def from_crawler(cls, crawler):
        """Create the spider for *crawler* and connect the signal handlers.

        Returns:
            The new spider instance.
        """
        # Delegate to the base implementation: it builds the instance with
        # cls(crawler.settings, crawler.stats) and additionally binds the
        # crawler and its settings to the spider.
        ext = super().from_crawler(crawler, crawler.settings, crawler.stats)

        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        return ext

    def spider_opened(self, spider):
        """Resolve the stored stats and settings objects into plain integers."""
        self.items_scraped = self.items_scraped.get_value('item_scraped_count')
        # The stat is missing until something has been counted: start from 0.
        if self.items_scraped is None:
            self.items_scraped = 0
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        """Log the name of the spider that was closed."""
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        """Count one scraped item and log a progress line every ``item_count`` items."""
        # Logged before the increment, so this shows the total so far
        # without the current item.
        logger.info("scraped few %s items", self.items_scraped)
        self.items_scraped += 1
        # The running total goes on the left of the modulo so that the
        # message fires at item_count, 2 * item_count, ...  With the operands
        # reversed it would fire only when the total divides item_count.
        # The truthiness guard avoids a ZeroDivisionError for an interval of 0.
        if self.item_count and self.items_scraped % self.item_count == 0:
            logger.info("scraped increments %s items", self.items_scraped)

    def start_requests(self):
        """Yield one request per entry of ``start_urls``, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        """Yield one item per ``div`` below ``div[@class = "row"]``.

        Each item holds the first ``href`` found under that ``div``.
        """
        content = response.xpath('//div[@class = "row"]//div')
        for node in content:
            yield {
                'some_items_links': node.xpath(".//a//@href").get()
            }

輸出:

.
.
.
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/author/C-S-Lewis'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/christianity/page/1/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/love/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/tag/truth/page/1/'}
...

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM