
Scrapy spider not yielding all start_requests urls in broad crawl

I am trying to create a scraper that scrapes the homepage and some deeper pages of more than 300,000 start_urls. The code runs without major errors, but it stops after only about 31,000 urls. The Scrapy log shows 'finish_reason' = 'finished'.

**I do not understand why Scrapy finishes while not all of the urls from start_requests have been scraped.**

The code below shows the spider I run.

import scrapy
from scrapy import Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from datetime import datetime
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError, ConnectionLost
from scrapy.spidermiddlewares.httperror import HttpError
import pandas as pd

# To run: scrapy crawl gptspider -o output.json --logfile logfile.txt


class TextSpider(scrapy.Spider):
    name = "gptspider"

    # Settings to optimize for our broad crawl, as recommended by https://docs.scrapy.org/en/latest/topics/broad-crawls.html
    custom_settings = {
        "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue",
        "CONCURRENT_REQUESTS": 100,
        "REACTOR_THREADPOOL_MAXSIZE": 20,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    }

    def __init__(self, *a, **kw):
        super(TextSpider, self).__init__(*a, **kw)
        self.progress_counter = 0
        self.MAX_DEPTH = 0
        self.companies_with_valid_url = []
        self.allowed_domains = []
        urlfilepath = "PATH_TO_CSV_WITH_OVER_100k_URLS"
        companyurls = pd.read_csv(urlfilepath)
        for i, row in companyurls.iterrows():
            some_property = row["Some_property"]
            url = row["website"]
            if url is not None and type(url) == str:
                base_url = TextSpider.convert_url_to_base(url)
                self.companies_with_valid_url.append(
                    {"some_property": some_property, "base_url": base_url}
                )
                self.allowed_domains.append(urlparse(base_url).netloc)

    # Put URLS into the right format
    @staticmethod
    def convert_url_to_base(url):
        if url is not None and type(url) == str:
            if not (
                url.startswith("//")
                or url.startswith("http://")
                or url.startswith("https://")
            ):
                url = "//" + url
            url = urlparse(url).netloc
            if ("http" in url) & (url[-1:] == "/"):
                return url
            elif ("http" in url) & (url[-1:] != "/"):
                return url + "/"
            elif ("http" not in url) & (url[-1:] == "/"):
                return "http://" + url
            else:
                return "http://" + url + "/"
        return url

    def start_requests(self):
        print(
            f"Starting requests for {len(self.companies_with_valid_url)} URLS"
        )  # Output Starting requests for 320833 companies
        for companyurl in self.companies_with_valid_url:
            yield scrapy.Request(
                url=companyurl["base_url"],
                callback=self.parse,
                errback=self.handle_error,
                dont_filter=True,
                meta={"some_property": companyurl["some_property"]},
            )
        print("THIS NEVER PRINTS")

    # Parse the response, extract the visible text and scrape the subpages
    def parse(self, response):
        if self.progress_counter % 100 == 0:
            print(self.progress_counter)
            print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        self.progress_counter += 1

        # THIS IS NOT YET USED
        if response.meta["depth"] < self.MAX_DEPTH:
            # Get a list of all the sub-pages to scrape
            sub_pages = response.xpath("//a/@href").getall()
            # Yield requests to scrape the sub-pages
            for sub_page in sub_pages:
                yield response.follow(
                    sub_page,
                    callback=self.parse,
                    errback=self.handle_error,
                    dont_filter=True,
                    meta={"some_property": response.request.meta["some_property"]},
                )

        # Yield the scraped text and the URL it came from
        return self.response_to_data(response)

    def handle_error(self, failure):

        if failure.check(HttpError):
            error_status = failure.value.response.status

        elif failure.check(DNSLookupError):
            error_status = "DNSLookupError"

        elif failure.check(TimeoutError, TCPTimedOutError):
            error_status = "TCPTimedOutError"
        elif failure.check(ConnectionLost):
            error_status = "ConnectionLost"
        else:
            error_status = "Other error"

        # add depth variable if not included (can happen under some error circumstances)
        if "depth" not in failure.request.meta:
            failure.request.meta["depth"] = 0

        return {
            "some_property": failure.request.meta["some_property"],
            "url": failure.request.url,
            "depth": failure.request.meta["depth"],
            "response_code": error_status,
            "scraped_at": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "text": failure.getErrorMessage(),
            "failure": 1,
        }

    def response_to_data(self, response):
        return {
            "some_property": response.request.meta["some_property"],
            "url": response.url,
            "depth": response.request.meta["depth"],
            "response_code": response.status,
            "scraped_at": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "text": self.extract_visible_text(response),
            "failure": 0,
        }

    # Extract the visible text from a scrapy HttpResponse
    def extract_visible_text(self, response):
        if response.status == 200 or response.status == 304:
            soup = BeautifulSoup(response.text, "html.parser")
            # Removes all script and style tags
            for script in soup(["style", "script"]):
                script.decompose()

            # If no "body" tag in the html text, the firms will be classified as unlabeled
            if soup.find("body") is None:
                return "NODATA_NO_BODY_TAG"
            else:
                html = soup.get_text()
                html = " ".join(html.split())
                visible_text = re.sub(r"(?<![A-Z])(?<!^)([A-Z])", r" \1", html)
                # visible_text = html
                # Handle surrogates
                visible_text = visible_text.encode("utf-8", "surrogateescape").decode(
                    "utf-8", "replace"
                )
                return visible_text
        else:
            return {"text": "NODATA_BAD_RESPONSE_CODE"}

I tried adjusting the settings (as shown in the spider code) to match Scrapy's recommendations for broad crawls, but without success.

I also checked the end of the log to see what happened just before the program finished. I did notice a "User timeout caused connection failure" error shortly before it finished (see the log excerpt below), but these errors occur throughout the log and, as far as I know, should not cause the program to finish.


...

Getting http://www.okaidi.be/ took longer than 180.0 seconds..', 'failure': 1}
2023-01-04 12:09:48 [scrapy.core.engine] INFO: Closing spider (finished)
2023-01-04 12:09:48 [scrapy.extensions.feedexport] INFO: Stored json feed (3040 items) in: output11.json
2023-01-04 12:09:48 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
...
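
For context, the 180.0 seconds in that message is Scrapy's default DOWNLOAD_TIMEOUT, and timed-out requests are retried by the built-in RetryMiddleware before the errback runs, so on their own they should not close the spider. A minimal sketch of the relevant settings, with illustrative values rather than the ones used in the run above:

# settings.py -- illustrative values, not the settings used in the run above
DOWNLOAD_TIMEOUT = 30  # default is 180 s, which matches the "180.0 seconds" message
RETRY_ENABLED = True   # RetryMiddleware retries timeouts and connection errors
RETRY_TIMES = 2        # default retry count before the errback is called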

Edit:

Summary of the end of the logfile:

2023-01-04 17:52:34 [scrapy.core.engine] INFO: Closing spider (finished)
2023-01-04 17:52:34 [scrapy.extensions.feedexport] INFO: Stored json feed (34754 items) in: output33.json
2023-01-04 17:52:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 9531,
 'downloader/exception_type_count/idna.core.InvalidCodepoint': 1,
 'downloader/exception_type_count/scrapy.exceptions.IgnoreRequest': 355,
 'downloader/exception_type_count/twisted.internet.error.ConnectionRefusedError': 49,
 'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 6360,
 'downloader/exception_type_count/twisted.internet.error.TCPTimedOutError': 1139,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 20,
 'downloader/exception_type_count/twisted.web._newclient.ResponseFailed': 24,
 'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 1583,
 'downloader/request_bytes': 33338047,
 'downloader/request_count': 138042,
 'downloader/request_method_count/GET': 138042,
 'downloader/response_bytes': 995846155,
 'downloader/response_count': 128866,
 'downloader/response_status_count/200': 59652,
 'downloader/response_status_count/202': 1,
 'downloader/response_status_count/301': 48480,
 'downloader/response_status_count/302': 7107,
 'downloader/response_status_count/303': 210,
 'downloader/response_status_count/307': 176,
 'downloader/response_status_count/308': 464,
 'downloader/response_status_count/400': 29,
 'downloader/response_status_count/401': 11,
 'downloader/response_status_count/402': 5,
 'downloader/response_status_count/403': 1260,
 'downloader/response_status_count/404': 8688,
 'downloader/response_status_count/405': 1,
 'downloader/response_status_count/406': 2,
 'downloader/response_status_count/409': 2,
 'downloader/response_status_count/410': 56,
 'downloader/response_status_count/418': 2,
 'downloader/response_status_count/429': 156,
 'downloader/response_status_count/456': 6,
 'downloader/response_status_count/500': 1709,
 'downloader/response_status_count/502': 47,
 'downloader/response_status_count/503': 654,
 'downloader/response_status_count/504': 27,
 'downloader/response_status_count/510': 2,
 'downloader/response_status_count/520': 4,
 'downloader/response_status_count/521': 7,
 'downloader/response_status_count/522': 87,
 'downloader/response_status_count/523': 2,
 'downloader/response_status_count/524': 3,
 'downloader/response_status_count/526': 2,
 'downloader/response_status_count/999': 14,
 'elapsed_time_seconds': 2787.423471,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 1, 4, 16, 52, 34, 847868),
 'httpcompression/response_bytes': 3578050995,
 'httpcompression/response_count': 53590,
 'item_scraped_count': 34754,
 'log_count/DEBUG': 684335,
 'log_count/ERROR': 5348,
 'log_count/INFO': 57,
 'log_count/WARNING': 426,
 'request_depth_max': 1,
 'response_received_count': 70239,
 'retry/count': 7823,
 'retry/max_reached': 3814,
 'retry/reason_count/429 Unknown Status': 118,
 'retry/reason_count/500 Internal Server Error': 994,
 'retry/reason_count/502 Bad Gateway': 32,
 'retry/reason_count/503 Service Unavailable': 436,
 'retry/reason_count/504 Gateway Time-out': 19,
 'retry/reason_count/522 Unknown Status': 58,
 'retry/reason_count/524 Unknown Status': 2,
 'retry/reason_count/twisted.internet.error.ConnectionRefusedError': 31,
 'retry/reason_count/twisted.internet.error.DNSLookupError': 4240,
 'retry/reason_count/twisted.internet.error.TCPTimedOutError': 804,
 'retry/reason_count/twisted.internet.error.TimeoutError': 14,
 'retry/reason_count/twisted.web._newclient.ResponseFailed': 16,
 'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 1059,
 "robotstxt/exception_count/<class 'idna.core.InvalidCodepoint'>": 1,
 "robotstxt/exception_count/<class 'twisted.internet.error.ConnectionRefusedError'>": 9,
 "robotstxt/exception_count/<class 'twisted.internet.error.DNSLookupError'>": 1094,
 "robotstxt/exception_count/<class 'twisted.internet.error.TCPTimedOutError'>": 163,
 "robotstxt/exception_count/<class 'twisted.internet.error.TimeoutError'>": 3,
 "robotstxt/exception_count/<class 'twisted.web._newclient.ResponseFailed'>": 2,
 "robotstxt/exception_count/<class 'twisted.web._newclient.ResponseNeverReceived'>": 261,
 'robotstxt/forbidden': 355,
 'robotstxt/request_count': 38926,
 'robotstxt/response_count': 37332,
 'robotstxt/response_status_count/200': 28193,
 'robotstxt/response_status_count/400': 16,
 'robotstxt/response_status_count/401': 6,
 'robotstxt/response_status_count/402': 3,
 'robotstxt/response_status_count/403': 618,
 'robotstxt/response_status_count/404': 8084,
 'robotstxt/response_status_count/405': 1,
 'robotstxt/response_status_count/406': 1,
 'robotstxt/response_status_count/409': 1,
 'robotstxt/response_status_count/410': 28,
 'robotstxt/response_status_count/418': 1,
 'robotstxt/response_status_count/429': 19,
 'robotstxt/response_status_count/500': 242,
 'robotstxt/response_status_count/502': 6,
 'robotstxt/response_status_count/503': 86,
 'robotstxt/response_status_count/504': 2,
 'robotstxt/response_status_count/510': 1,
 'robotstxt/response_status_count/520': 2,
 'robotstxt/response_status_count/521': 3,
 'robotstxt/response_status_count/522': 13,
 'robotstxt/response_status_count/523': 1,
 'robotstxt/response_status_count/526': 1,
 'robotstxt/response_status_count/999': 4,
 'scheduler/dequeued': 72075,
 'scheduler/dequeued/memory': 72075,
 'scheduler/enqueued': 72075,
 'scheduler/enqueued/memory': 72075,
 'spider_exceptions/ValueError': 1,
 'start_time': datetime.datetime(2023, 1, 4, 16, 6, 7, 424397)}
2023-01-04 17:52:34 [scrapy.core.engine] INFO: Spider closed (finished)

I followed the recommendation from @granitosaurus on limiting the number of start_urls and adjusted the code, as suggested, to crawl in batches. This seems to have worked: the program now crawls all of the urls. A sketch of that batching idea is shown below.
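
The batching approach boils down to handing the scheduler a limited number of start requests at a time and topping it up from the spider_idle signal, instead of yielding all 300,000+ requests from start_requests at once. A minimal sketch of that idea (the spider name, batch size and placeholder URL list are illustrative, not the original code):

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class BatchedStartSpider(scrapy.Spider):
    # Hypothetical spider name and batch size, not taken from the code above.
    name = "batched_start"
    BATCH_SIZE = 1000

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Re-fill the scheduler whenever it runs dry instead of letting it close.
        crawler.signals.connect(spider.on_idle, signal=signals.spider_idle)
        return spider

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        # Stand-in for the list built from the CSV in the original spider.
        self.pending_urls = iter(["http://example.com/", "http://example.org/"])

    def start_requests(self):
        yield from self.next_batch()

    def next_batch(self):
        # Yield at most BATCH_SIZE requests from the remaining URLs.
        for _ in range(self.BATCH_SIZE):
            url = next(self.pending_urls, None)
            if url is None:
                return
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def on_idle(self):
        # Called via the spider_idle signal when the scheduler is empty:
        # queue the next batch and keep the spider alive until nothing is left.
        scheduled = False
        for request in self.next_batch():
            self.crawler.engine.crawl(request)  # older Scrapy versions also take the spider
            scheduled = True
        if scheduled:
            raise DontCloseSpider

    def parse(self, response):
        yield {"url": response.url, "status": response.status}

The DontCloseSpider exception is what keeps the crawl running between batches; once next_batch() yields nothing, the spider closes with finish_reason 'finished' as usual.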
