I am trying to print my scrapy log outputs and write them to my db.
Spider
class NigdeBotSpider(scrapy.Spider):
    """Scrape daily wholesale market prices for Nigde from halfiyatlari.net
    and persist them through the project's Django models.

    Skips the crawl entirely (CloseSpider) when the page's date matches the
    date already stored for this market, so the same day is never re-saved.
    """

    name = 'nigdehal'  # bot name used by scrapy to launch this spider
    allowed_domains = ['www.halfiyatlari.net']  # main domain
    start_urls = ['https://www.halfiyatlari.net/nigde-hal-fiyatlari']  # page to scrape

    def parse(self, response):
        location = "nigdehal"
        url = response.url
        url_parts = url.split("/")
        url_tag = Urls.objects.get(url_tag=url_parts[3])
        spiderobj = Spiders.objects.get(spider_name=location)

        # Parse the page date separately; dateFixer returns (day, month, year)
        # strings extracted from the header text.
        raw_date = str(response.xpath('//div[@class="col-lg-12"]/span/text()').extract())
        fixed_date = dateFixer(raw_date)
        day = int(fixed_date[0])
        month = int(fixed_date[1])
        year = int(fixed_date[2])

        market_date = Markets.objects.get(spider=spiderobj)
        if (day == market_date.market_day
                and month == market_date.market_month
                and year == market_date.market_year):
            raise CloseSpider('Dates are same!')

        # The market row is the same for every product, so fetch/create it
        # once instead of once per table row (original did this in the loop).
        try:
            market = Markets.objects.get(market_name=location)
        except Markets.DoesNotExist:  # narrow except instead of bare `except:`
            market = Markets(market_name=location, spider=spiderobj)
            market.save()

        rows = response.xpath('//*[@class="table-responsive"]//tr')
        # Row 1 is the header; product rows are //tr[2] .. //tr[len(rows)].
        # Using an explicit index range fixes the original bug where a
        # `continue` skipped the index increment and re-read the same row.
        for i in range(2, len(rows) + 1):
            product_name = wordFixer(response.xpath('//tr[' + str(i) + ']/td[1]/text()').get())
            high_price = priceFixer(response.xpath('//tr[' + str(i) + ']/td[3]/text()').get())
            low_price = priceFixer(response.xpath('//tr[' + str(i) + ']/td[2]/text()').get())
            if not product_name:
                continue  # skip empty/spacer rows

            # Values are already cleaned above — do not run the fixers twice.
            new_data = Products(
                product_name=product_name,
                price=str(low_price),  # NOTE(review): price mirrors low_price — confirm intended
                low_price=str(low_price),
                high_price=str(high_price),
                product_URL=url,
                market=market,
                day=day,
                month=month,
                year=year,
            )
            new_data.save()

        # Record the date we just scraped, once, only if product rows existed.
        if len(rows) > 1:
            market.market_day = day
            market.market_month = month
            market.market_year = year
            market.save()

        url_tag.url_status = response.status
        url_tag.save()
Start crawler
def startSpider(spider_name, spider_class):
    """Import a spider module by name, configure file logging, and schedule it.

    Args:
        spider_name: module name under ``first_bot.first_bot.spiders``.
        spider_class: name of the spider class inside that module.

    Side effects: configures root logging to write to a dated
    ``scrapy-log-DD-MM-YY.txt`` file and schedules the crawl on a
    CrawlerRunner.

    NOTE(review): ``CrawlerRunner.crawl`` only schedules the crawl and
    returns a Deferred — the Twisted reactor must be running (or started
    afterwards) for the spider to actually execute. Confirm the caller
    handles that.
    """
    module_path = 'first_bot.first_bot.spiders.' + spider_name
    date_stamp = datetime.datetime.today().strftime("%d-%m-%y")

    module = importlib.import_module(module_path)
    spider_cls = getattr(module, spider_class)

    # Route scrapy's log records through the stdlib root logger into a file.
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='scrapy-log-' + date_stamp + '.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO,
    )

    try:
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(spider_cls)
    except OSError as e:
        # Original code did `"Error code:" + e`, which raises TypeError —
        # an exception object cannot be concatenated to a str.
        print("Failed with: " + (e.strerror or str(e)))
        print("Error code: " + str(e.errno))
I just want to add something to my spider (or to my start
script) that prints the error count and crawl count, like the following log output:
'downloader/request_bytes': 498,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 8728,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'elapsed_time_seconds': 12.462258,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 9, 29, 10, 34, 20, 855907),
'log_count/DEBUG': 2,
'log_count/ERROR': 1,
'log_count/INFO': 10,
I have Scrapy 2.3.0 and python3. I don't know which library and which logger to use. What is your suggestion?
To find out how many errors occur at runtime, one approach is to keep an
`error_count` variable in the crawler stats and increase it on every occurrence.
Wrap the body of your `parse()` method in a `try`/`except`
block and increment the counter each time the `except` block runs:
def parse(self, response):
    """Example: count parse failures in the crawler stats.

    ``inc_value`` creates the key with a start value of 0 on first use, so
    no explicit ``set_value('error_count', 0)`` is needed — calling
    ``set_value`` here would reset the counter on every response.
    """
    try:
        # ... normal parsing work goes here ...
        pass
    except Exception:
        # Increment on every failure; the total appears in the final
        # stats dump that scrapy logs when the crawl finishes.
        self.crawler.stats.inc_value('error_count')
The stats log would then look like:
'error_count': n,
'downloader/request_bytes': 498,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
...
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.