Can somebody help me understand how to capture the response status code for every crawl request made by my Scrapy spider? I was able to get output for response code 200, but if the website returns a 404 it doesn't write any output, and the same goes for 301 and 302.
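From what I read in the docs, non-2xx responses are filtered out by HttpErrorMiddleware before they reach the spider callbacks unless their codes are whitelisted, and 301/302 are consumed by RedirectMiddleware in the same way. A minimal sketch of what I believe is involved (the spider name and URL are just placeholders, and I have not verified this):

import scrapy

class StatusSketchSpider(scrapy.Spider):
    # placeholder spider, only to illustrate the setting
    name = 'status_sketch'
    # list 404 plus the redirect codes so HttpErrorMiddleware and
    # RedirectMiddleware pass them through to the callback
    handle_httpstatus_list = [301, 302, 404]
    start_urls = ['http://httpbin.org/status/404']

    def parse(self, response):
        # every listed status now reaches this callback
        self.logger.info('%s %s', response.status, response.url)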
Here is the code I implemented for another website, with a placeholder domain containing my name added for reference.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider

class TestSpider(CrawlSpider):
    name = 'TestSpider'
    handle_httpstatus_list = [404]

    resp_log_file = 'C:\\resp'
    ok_log_file = 'C:\\alright'
    bad_log_file = 'C:\\badresp'
    redirect_log_file = 'C:\\redirect'

    allowed_domains = ['santhosh.com']
    start_urls = ['http://santhosh.com/']

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse_item(self, response):
        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.resp_log_file, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        try:
            if response.status == 404:
                ## 404s are also tracked separately
                self.append(self.bad_log_file, response.url)
            elif response.status == 200:
                ## log to ok_log_file
                self.append(self.ok_log_file, response.url)
            elif response.status == 302:
                ## log to redirect_log_file
                self.append(self.redirect_log_file, response.url)
            else:
                self.append(self.bad_log_file, response.url)
        except Exception, e:
            pass
        return None

    def append(self, file, string):
        print " Writing content to File "
        file = open(file, 'a')
        file.write(string + "\n")
        file.close()
I have seen questions related to response code capture, but they were not exactly similar to my requirement, hence this new post. If there is already a question covering this, I request you to ignore this one and redirect me there. Thanks in advance!
I tried the code and I see it sends 404 and 301 to parse(), not to parse_item(). As far as I can tell, that is because CrawlSpider uses parse() internally to process the responses for start_urls and apply the rules, so overriding parse() also means the rules never run for those responses. Besides, I don't have a page with broken links, so LinkExtractor never starts. I used httpbin.org to generate pages with different status codes. Maybe with a page containing broken URLs, LinkExtractor could run and I could get different results.
#!/usr/bin/env python3
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider
#from scrapy.commands.view import open_in_browser

class MySpider(CrawlSpider):
    name = 'MySpider'
    handle_httpstatus_list = [404, 301, 302, 303]

    all_responses_log = './responses_all.log'
    ok_responses_log = './responses_ok.log'
    bad_responses_log = './responses_bad.log'
    redirects_responses_log = './responses_redirect.log'

    start_urls = [
        'http://httpbin.org/status/301',
        'http://httpbin.org/status/302',
        'http://httpbin.org/status/303',
        'http://httpbin.org/status/404',
        'http://httpbin.org/status/200',
    ]

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse(self, response):
        print('parse url:', response.url)
        self.test_status('parse()', response)

    def parse_item(self, response):
        print('parse item url:', response.url)
        self.test_status('parse_item()', response)
        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.all_responses_log, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)

    def test_status(self, text, response):
        try:
            if response.status == 404:
                log = self.bad_responses_log
            elif response.status == 200:
                log = self.ok_responses_log
            #elif 299 < response.status < 400:
            elif response.status in (301, 302, 303, 307):
                log = self.redirects_responses_log
            else:
                log = self.bad_responses_log
            message = "{} | {} | {}\n".format(response.status, text, response.url)
            self.append(log, message)
        except Exception as e:
            print('Error:', e)

    def append(self, filename, string):
        print('Writing log:', filename)
        with open(filename, 'a') as f:
            f.write(string)

# --- run it standalone, without creating a Scrapy project ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider)
c.start()
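As an alternative sketch (my assumption, I didn't test it against a real site): instead of whitelisting the redirect codes, you can let RedirectMiddleware follow them and read the followed chain from response.meta['redirect_urls'], and catch hard errors like 404 with an errback. The spider name and URLs below are placeholders:

#!/usr/bin/env python3
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError

class AltSpider(scrapy.Spider):
    # placeholder name and urls, only to show the pattern
    name = 'AltSpider'

    def start_requests(self):
        urls = [
            'http://httpbin.org/redirect/1',
            'http://httpbin.org/status/404',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        # RedirectMiddleware stores the urls it followed in response.meta
        chain = response.meta.get('redirect_urls', [])
        if chain:
            print('redirect:', chain, '->', response.url)
        else:
            print('ok:', response.status, response.url)

    def on_error(self, failure):
        # HttpErrorMiddleware raises HttpError for non-2xx responses
        # that are not listed in handle_httpstatus_list
        if failure.check(HttpError):
            response = failure.value.response
            print('bad:', response.status, response.url)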