
Crawling websites and capturing different response status codes using Python Scrapy

Can somebody help me understand how to capture the response status code for every crawl request made by my Scrapy spider? I am able to get output for responses with status code 200, but if the website returns a 404 error nothing is written, and the same happens for 301 and 302.
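
For context: with Scrapy's default middleware stack, only 2xx responses reach spider callbacks, because HttpErrorMiddleware filters out other statuses and RedirectMiddleware follows 301/302 before any callback sees them. The following is a minimal sketch (the spider name and httpbin.org URLs are placeholders, not taken from the question) of the whitelist that lets those statuses through:

import scrapy


class StatusProbeSpider(scrapy.Spider):
    # Hypothetical spider, for illustration only.
    name = 'status_probe'

    # Statuses listed here are handed to the callback instead of being
    # dropped by HttpErrorMiddleware (non-2xx) or consumed by
    # RedirectMiddleware (3xx).
    handle_httpstatus_list = [301, 302, 404]

    start_urls = [
        'http://httpbin.org/status/404',
        'http://httpbin.org/status/301',
    ]

    def parse(self, response):
        # 301/302/404 arrive here with their original status code.
        self.logger.info('%s -> %s', response.status, response.url)
        # Per-request alternative:
        # yield scrapy.Request(url, meta={'handle_httpstatus_list': [404]})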

Here is the code that I have implemented for another website; for reference, I have replaced the real domain with a placeholder containing my name.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider



class TestSpider(CrawlSpider):
    name = 'TestSpider' 
    handle_httpstatus_list = [404]
    resp_log_file = 'C:\\resp'
    ok_log_file = 'C:\\alright'
    bad_log_file = 'C:\\badresp'
    redirect_log_file = 'C:\\redirect'

    allowed_domains = ['santhosh.com']
    start_urls = ['http://santhosh.com/']

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse_item(self, response):
        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.resp_log_file, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        try:
            if response.status == 404:
                ## 404s are also tracked separately
                self.append(self.bad_log_file, response.url)
            elif response.status == 200:
                ## write to ok_log_file
                self.append(self.ok_log_file, response.url)
            elif response.status == 302:
                ## write to redirect_log_file
                self.append(self.redirect_log_file, response.url)
            else:
                self.append(self.bad_log_file, response.url)
        except Exception as e:
            pass

        return None


    def append(self, filename, string):
        print("Writing content to file")
        with open(filename, 'a') as f:
            f.write(string + "\n")

I have seen questions related to response code capture, but they were not exactly the same as my requirement, hence this new post. If there is already a question covering this, please ignore this one and redirect me there. Thanks in advance!

I tried the code and I see that it sends 404 and 301 responses to parse(), not to parse_item(), but I don't have a page with broken links, so it never starts the LinkExtractor.

I used httpbin.org to generate pages with different status codes.

Maybe if I had a page with broken URLs, the LinkExtractor would run and I would get different results.

#!/usr/bin/env python3

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider
#from scrapy.commands.view import open_in_browser

class MySpider(CrawlSpider):

    name = 'MySpider' 

    handle_httpstatus_list = [404, 301, 302, 303]

    all_responses_log = './responses_all.log'
    ok_responses_log  = './responses_ok.log'
    bad_responses_log = './responses_bad.log'
    redirects_responses_log = './responses_redirect.log'

    start_urls = [
        'http://httpbin.org/status/301',
        'http://httpbin.org/status/302',
        'http://httpbin.org/status/303',

        'http://httpbin.org/status/404',
        'http://httpbin.org/status/200',
    ]

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse(self, response):
        print('parse url:', response.url)

        self.test_status('parse()', response)

    def parse_item(self, response):
        print('parse item url:', response.url)

        self.test_status('parse_item()', response)

        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.all_responses_log, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)

    def test_status(self, text, response):
        try:
            if response.status == 404:
                log = self.bad_responses_log
            elif response.status == 200:
                log = self.ok_responses_log
            #elif 299 < response.status < 400:
            elif response.status in (301, 302, 303, 307):
                log = self.redirects_responses_log
            else:
                # any other status (e.g. 5xx) goes to the bad log
                log = self.bad_responses_log

            message = "{} | {} | {}\n".format(response.status, text, response.url)
            self.append(log, message)
        except Exception as e:
            print('Error:', e)

    def append(self, filename, string):
        print('Writing log:', filename)
        with open(filename, 'a') as f:
            f.write(string)


# --- it runs without a Scrapy project and appends to the .log files defined above ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider)
c.start()
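
Status-code logging only covers requests that actually got an HTTP response; DNS failures, timeouts and refused connections never produce one. Below is a minimal sketch of catching those with an errback, under the same httpbin.org assumptions (the spider name and URLs are placeholders):

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError


class ErrbackProbeSpider(scrapy.Spider):
    # Hypothetical spider, for illustration only.
    name = 'errback_probe'

    start_urls = ['http://httpbin.org/status/404']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        self.logger.info('Got %s for %s', response.status, response.url)

    def on_error(self, failure):
        if failure.check(HttpError):
            # Non-2xx responses that are not whitelisted end up here.
            response = failure.value.response
            self.logger.warning('HttpError %s on %s', response.status, response.url)
        elif failure.check(DNSLookupError):
            self.logger.warning('DNS lookup failed for %s', failure.request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            self.logger.warning('Request timed out for %s', failure.request.url)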
