Scrapy images do not download

The spider runs and finds the image URLs, but for some reason it does not download the images. It prints each Item's info in the terminal, yet nothing is saved to disk. I have tried every combination of settings I could find on SO, but so far with no luck. This scraper used to work, so the breakage may be linked to an update in the latest version of Scrapy.

I run the spider with the command scrapy runspider /path/to/myspider.py
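
Note that scrapy runspider only picks up the project's settings.py when it is launched from inside the project directory (where scrapy.cfg lives); run from anywhere else, the ITEM_PIPELINES and IMAGES_STORE values below are never applied. A minimal sketch (an assumption, since the question does not say where the command is run from) that pins those settings on the spider itself:

# Hedged sketch: custom_settings makes the run self-contained, so the
# pipeline stays enabled even when runspider is launched outside the project.
import scrapy


class MySpider(scrapy.Spider):
    name = "myspider"
    custom_settings = {
        "ITEM_PIPELINES": {"crawler.pipelines.SessionImagesPipeline": 1},
        "IMAGES_STORE": "images",
    }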

Versions:

  • scrapy==2.7.1
  • python==3.10.8

settings.py

BOT_NAME = "my_bot"

SPIDER_MODULES = ["my_bot.spiders"]
NEWSPIDER_MODULE = "my_bot.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'ooshot_marketplace (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies (enabled by default; kept enabled here)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    "my_bot.middlewares.OoshotMarketplaceSpiderMiddleware": 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "my_bot.middlewares.OoshotMarketplaceDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'my_bot.pipelines.MyPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# DUPEFILTER_DEBUG = True
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}


ITEM_PIPELINES = {"crawler.pipelines.SessionImagesPipeline": 1}

IMAGES_STORE = "images"

IMAGES_URLS_FIELD = "image_urls"  # copy verbatim
IMAGES_RESULT_FIELD = "images"  # copy verbatim
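
One detail worth flagging (an assumption about the failure mode, not something the question confirms): IMAGES_STORE = "images" is a relative path, resolved against whatever directory the crawl command is launched from, so downloads may be landing somewhere unexpected. Making it absolute removes that variable:

# Hedged tweak: anchor IMAGES_STORE next to settings.py so downloads land
# in a predictable place no matter where the command is launched from.
from pathlib import Path

IMAGES_STORE = str(Path(__file__).parent / "images")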

myspider.py

import os

import scrapy


class ImageItem(scrapy.Item):

    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()


class MySpider(scrapy.Spider):

    name = "myspider"
    start_urls = ["http://my-url/"]

    http_user = "my-user"
    http_pass = "my-passwd"

    def parse(self, response):

        photographers_urls = response.css(".search-result-name a::attr(href)").extract()
        for photographer_url in photographers_urls:
            yield scrapy.Request(
                response.urljoin(photographer_url), callback=self.parse_photographer
            )

        photographers_pages_urls = response.css(".pagination a::attr(href)").extract()
        for photographer_page_url in photographers_pages_urls:
            yield scrapy.Request(
                response.urljoin(photographer_page_url), callback=self.parse
            )

    def parse_photographer(self, response):
        photographer_name = os.path.basename(response.url)
        categories_urls = response.css(
            ".profile-header-categories a::attr(href)"
        ).extract()
        for category_url in categories_urls:
            yield scrapy.Request(
                response.urljoin(category_url),
                callback=self.parse_category,
                meta={"photographer_name": photographer_name},
            )

    def parse_category(self, response):
        category_name = os.path.basename(response.url)
        photos_urls = response.css(".grid-col a::attr(href)").extract()
        for photo_url in photos_urls:
            yield scrapy.Request(
                response.urljoin(photo_url),
                callback=self.save_photo,
                meta={
                    "photographer_name": response.meta["photographer_name"],
                    "category_name": category_name,
                },
            )

    def save_photo(self, response):
        image_url = response.css(".js-photo-details-photo::attr(src)").extract_first()

        image_item = ImageItem()
        image_item["image_urls"] = [response.urljoin(image_url)]
        image_item["photographer_name"] = response.meta["photographer_name"]
        image_item["category_name"] = response.meta["category_name"]
        yield image_item

pipelines.py

import os

from scrapy.pipelines.images import ImageException, ImagesPipeline

class SessionImagesPipeline(ImagesPipeline):

    # move each downloaded image into a <category>/<photographer> subfolder
    def item_completed(self, results, item, info):

        # iterate over the local file paths of all downloaded images
        for result in [x for ok, x in results if ok]:
            path = result["path"]
            # here we create the session-path where the files should be in the end
            # you'll have to change this path creation depending on your needs

            # settings = get_project_settings()
            storage = "/my/path/images"

            category_path = os.path.join(storage, item["category_name"])
            if not os.path.isdir(category_path):
                os.mkdir(category_path)

            photographer_path = os.path.join(category_path, item["photographer_name"])
            if not os.path.isdir(photographer_path):
                os.mkdir(photographer_path)

            target_path = os.path.join(photographer_path, os.path.basename(path))
            path = os.path.join(storage, path)

            # try to move the file; os.rename() returns None on success, so
            # "if not os.rename(...)" would always raise - catch OSError instead
            try:
                os.rename(path, target_path)
            except OSError as exc:
                raise ImageException("Could not move image to target folder") from exc

            # here we'll write out the result with the new path,
            # if there is a result field on the item (just like the original code does)
            if self.IMAGES_RESULT_FIELD in item.fields:
                result["path"] = target_path
                # the result field may be unset on the item; initialise it first
                item.setdefault(self.IMAGES_RESULT_FIELD, []).append(result)

        return item
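
An alternative worth considering (a sketch, not the asker's code): Scrapy 2.4+ passes the item into ImagesPipeline.file_path(), so the per-category/per-photographer layout can be produced at download time instead of moving files afterwards:

# Hedged alternative: override file_path() so each image is written straight
# into <IMAGES_STORE>/<category>/<photographer>/, with no post-download rename.
import os

from scrapy.pipelines.images import ImagesPipeline


class FolderImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # the default path looks like "full/<sha1 of url>.jpg"; keep the hashed name
        default_path = super().file_path(request, response=response, info=info, item=item)
        filename = os.path.basename(default_path)
        return os.path.join(item["category_name"], item["photographer_name"], filename)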

You are missing the images result field in your ImageItem. ImagesPipeline writes its download results into the field named by IMAGES_RESULT_FIELD ("images" in your settings), and a scrapy.Item only accepts fields declared on the class, so without the declaration the results are never stored on the item.

class ImageItem(scrapy.Item):

    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()
    images = scrapy.Field()   # <----- add this
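
For context, here is a minimal illustration (with a hypothetical Demo item, not from the question) of why the declaration matters:

# scrapy.Item rejects assignments to fields that were never declared.
import scrapy

class Demo(scrapy.Item):
    image_urls = scrapy.Field()

item = Demo()
item["image_urls"] = ["http://example.com/a.jpg"]  # fine: declared field
item["images"] = []  # KeyError: Demo does not support field: images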
