
How can I print error and other counts in Scrapy as integer variables?

I am trying to print my Scrapy log output and write it to my database.

Spider

import scrapy
from scrapy.exceptions import CloseSpider

# Project-specific imports: the Django models (Urls, Spiders, Markets, Products)
# and helpers (dateFixer, wordFixer, priceFixer) are defined elsewhere in the project.


class NigdeBotSpider(scrapy.Spider):

    name = 'nigdehal'  # Bot name for Scrapy
    allowed_domains = ['www.halfiyatlari.net']  # Main domain
    start_urls = ['https://www.halfiyatlari.net/nigde-hal-fiyatlari']  # The URL to scrape on this site

    def parse(self, response):
        location = "nigdehal"
        url = response.url
        urls_ = url.split("/")
        url_tag = Urls.objects.get(url_tag=urls_[3])
        spiderobj = Spiders.objects.get(spider_name=location)

        date = str(response.xpath('//div[@class="col-lg-12"]/span/text()').extract())  # Parse the date separately
        last_date = dateFixer(date)
        day = int(last_date[0])
        month = int(last_date[1])
        year = int(last_date[2])

        market_date = Markets.objects.get(spider=spiderobj)

        if day == market_date.market_day and month == market_date.market_month and year == market_date.market_year:
            raise CloseSpider('Dates are same!')

        i = 2  # Skip the table name and header rows

        path_length = len(response.xpath('//*[@class="table-responsive"]//tr'))

        for products in response.xpath('//*[@class="table-responsive"]//tr'):  # Parse product by product

            if path_length == 1:
                break

            # Rows are selected by the running index i rather than from `products`
            product_name = wordFixer(products.xpath('//tr[' + str(i) + ']/td[1]/text()').get())
            high_price = priceFixer(products.xpath('//tr[' + str(i) + ']/td[3]/text()').get())
            low_price = priceFixer(products.xpath('//tr[' + str(i) + ']/td[2]/text()').get())

            if not product_name:
                continue
            try:
                market = Markets.objects.get(market_name=location)
            except Markets.DoesNotExist:
                market = Markets(market_name=location, spider=spiderobj)
                market.save()

            new_data = Products(
                product_name=product_name,  # already cleaned by wordFixer above
                price=low_price,            # already cleaned by priceFixer above
                low_price=low_price,
                high_price=high_price,
                product_URL=url,
                market=market,
                day=day,
                month=month,
                year=year,
            )

            market.market_day = day
            market.market_month = month
            market.market_year = year
            market.save()
            new_data.save()
            i += 1
            path_length -= 1
        url_tag.url_status = response.status
        url_tag.save()

Start crawler

import datetime
import importlib
import logging

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


def startSpider(spider_name, spider_class):
    name = 'first_bot.first_bot.spiders.' + spider_name
    now = datetime.datetime.today()
    now_time = now.strftime("%d-%m-%y")
    i = importlib.import_module(name)
    class_ = getattr(i, spider_class)
    configure_logging(install_root_handler=False)

    logging.basicConfig(
        filename='scrapy-log-' + now_time + '.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )
    try:
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(class_)  # returns a Deferred; the Twisted reactor must be running
    except OSError as e:
        print("Failed with: " + e.strerror)
        print("Error code: " + str(e.errno))

I just want to add something to my spider or my start script that prints my error count and crawl count, as in the following log:

'downloader/request_bytes': 498,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 8728,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'elapsed_time_seconds': 12.462258,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 29, 10, 34, 20, 855907),
'log_count/DEBUG': 2,
'log_count/ERROR': 1,
'log_count/INFO': 10,

I have Scrapy 2.3.0 and Python 3. I don't know which library or which logger to use. What is your suggestion?

To find out how many errors occur during a run, one approach is to keep an error_count variable in the crawler stats and increase it for every occurrence: wrap the body of your parse() method in a try/except block and increment the stat each time the except block runs:

def parse(self, response):
    # set error_count as a stat with an initial value of 0
    # (optional: inc_value() below also creates the key starting at 0)
    self.crawler.stats.set_value('error_count', 0)
    try:
        ...  # try block action
    except Exception:
        ...  # except block action
        self.crawler.stats.inc_value('error_count')  # increase the value every time an error occurs
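To read the counter back as a plain integer, for example to write it to your db, you can use stats.get_value(). A minimal sketch using the spider's closed() hook (closed() and get_value() are standard Scrapy; the DB write is left as a placeholder):

def closed(self, reason):
    # called automatically when the spider finishes
    error_count = self.crawler.stats.get_value('error_count', 0)  # plain int
    self.logger.info("error_count: %d", error_count)
    # ... write error_count to the database here ...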

The stats log would then include:

'error_count': n,
'downloader/request_bytes': 498,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
...
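If you want all of the built-in counts (log_count/ERROR, downloader/request_count, and so on) as integers in your start script instead, you can connect a handler to the spider_closed signal before starting the crawl and read the full stats dict there. A minimal sketch, assuming it goes inside startSpider(); write_counts_to_db() is a hypothetical placeholder for your own DB code:

from scrapy import signals

def on_spider_closed(spider):
    stats = spider.crawler.stats.get_stats()  # plain dict: stat name -> value
    error_count = stats.get('log_count/ERROR', 0)           # integer
    crawl_count = stats.get('downloader/request_count', 0)  # integer
    # write_counts_to_db(error_count, crawl_count)  # hypothetical DB helper

crawler = runner.create_crawler(class_)
crawler.signals.connect(on_spider_closed, signal=signals.spider_closed)
runner.crawl(crawler)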
