Scrapy images do not download

The scraper runs and finds the URLs of the images, but for some reason it won't download them. It prints the Items' information in the terminal, but nothing gets recorded. I have tried all the combinations of settings I could find on SO, but no luck so far. This scraper used to work, so the breakage may be linked to changes in a recent version of Scrapy.

I run the command scrapy runspider /path/to/myspider.py
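
If the items are printed but no files appear, the log usually says why. The same command with verbose logging (assuming Scrapy's default logging setup):

scrapy runspider /path/to/myspider.py -L DEBUG

At DEBUG level you should see the individual image requests and, at startup, a warning if the images pipeline was disabled, for example because Pillow, which ImagesPipeline requires, is not installed.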

Versions:

  • scrapy==2.7.1
  • python==3.10.8

settings.py

BOT_NAME = "my_bot"

SPIDER_MODULES = ["my_bot.spiders"]
NEWSPIDER_MODULE = "my_bot.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'ooshot_marketplace (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    "my_bot.middlewares.OoshotMarketplaceSpiderMiddleware": 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "my_bot.middlewares.OoshotMarketplaceDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'my_bot.pipelines.MyPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# DUPEFILTER_DEBUG = True
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}


ITEM_PIPELINES = {"crawler.pipelines.SessionImagesPipeline": 1}

IMAGES_STORE = "images"

IMAGES_URLS_FIELD = "image_urls"  # copy verbatim
IMAGES_RESULT_FIELD = "images"  # copy verbatim
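
Before digging into the custom pipeline, one way to isolate the problem (a sketch, not part of the original setup) is to temporarily swap in the stock pipeline, as in the commented-out line above:

ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
IMAGES_STORE = "images"

If files appear under images/full/ with this configuration, the issue lies in SessionImagesPipeline rather than in the spider or the downloader.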

my_spider.py

import os

import scrapy


class ImageItem(scrapy.Item):

    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()


class MySpider(scrapy.Spider):

    name = "myspider"
    start_urls = ["http://my-url/"]

    http_user = "my-user"
    http_pass = "my-passwd"

    def parse(self, response):

        photographers_urls = response.css(".search-result-name a::attr(href)").extract()
        for photographer_url in photographers_urls:
            yield scrapy.Request(
                response.urljoin(photographer_url), callback=self.parse_photographer
            )

        photographers_pages_urls = response.css(".pagination a::attr(href)").extract()
        for photographer_page_url in photographers_pages_urls:
            yield scrapy.Request(
                response.urljoin(photographer_page_url), callback=self.parse
            )

    def parse_photographer(self, response):
        photographer_name = os.path.basename(response.url)
        categories_urls = response.css(
            ".profile-header-categories a::attr(href)"
        ).extract()
        for category_url in categories_urls:
            yield scrapy.Request(
                response.urljoin(category_url),
                callback=self.parse_category,
                meta={"photographer_name": photographer_name},
            )

    def parse_category(self, response):
        category_name = os.path.basename(response.url)
        photos_urls = response.css(".grid-col a::attr(href)").extract()
        for photo_url in photos_urls:
            yield scrapy.Request(
                response.urljoin(photo_url),
                callback=self.save_photo,
                meta={
                    "photographer_name": response.meta["photographer_name"],
                    "category_name": category_name,
                },
            )

    def save_photo(self, response):
        image_url = response.css(".js-photo-details-photo::attr(src)").extract_first()

        image_item = ImageItem()
        image_item["image_urls"] = [response.urljoin(image_url)]
        image_item["photographer_name"] = response.meta["photographer_name"]
        image_item["category_name"] = response.meta["category_name"]
        yield image_item
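
The selector in save_photo can be verified outside a full crawl with scrapy shell; a quick sketch, assuming a real photo page URL on the site that is reachable (the spider's HTTP auth attributes would not apply in the shell):

scrapy shell "http://my-url/some-photo-page"
>>> response.css(".js-photo-details-photo::attr(src)").get()

If this returns None, response.urljoin(None) silently falls back to the page URL itself, so the pipeline would be handed an HTML page and reject it as a non-image.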

pipeline.py

import os
import shutil

from scrapy.pipelines.images import ImageException, ImagesPipeline


class SessionImagesPipeline(ImagesPipeline):

    def item_completed(self, results, item, info):
        # iterate over the local file paths of all successfully downloaded images
        for result in [x for ok, x in results if ok]:
            path = result["path"]
            # here we create the session path where the files should end up;
            # you'll have to change this path creation depending on your needs

            # settings = get_project_settings()
            storage = "/my/path/images"

            category_path = os.path.join(storage, item["category_name"])
            os.makedirs(category_path, exist_ok=True)

            photographer_path = os.path.join(category_path, item["photographer_name"])
            os.makedirs(photographer_path, exist_ok=True)

            target_path = os.path.join(photographer_path, os.path.basename(path))
            path = os.path.join(storage, path)

            # move the file into the session folder, raising if it is not possible
            try:
                shutil.move(path, target_path)
            except OSError as exc:
                raise ImageException("Could not move image to target folder") from exc

            # record the result under the new path, if the item declares
            # the result field (mirroring what the stock pipeline does)
            if self.images_result_field in item.fields:
                result["path"] = target_path
                item.setdefault(self.images_result_field, []).append(result)

        return item
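
For reference when working inside item_completed: results is a list of two-tuples, one per URL in image_urls, and only the successful entries carry a path, which is why the loop filters on ok. A minimal illustration with placeholder values:

# results as passed to item_completed (placeholder values):
# [(True, {"url": "http://my-url/img.jpg",   # the requested image URL
#          "path": "full/0a79c461.jpg",      # path relative to IMAGES_STORE
#          "checksum": "b097...",            # md5 of the downloaded body
#          "status": "downloaded"}),         # or "uptodate" for cached files
#  (False, failure)]                         # a twisted Failure on error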

You are missing the images result field in your image item. ImagesPipeline records download results under the field named by IMAGES_RESULT_FIELD ("images" here), and your item_completed only records results when that field is declared on the item, so without it the results are silently dropped.

class ImageItem(scrapy.Item):

    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()
    images = scrapy.Field()   # <----- add this
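
With the field declared, a run that exports items, e.g. scrapy runspider /path/to/myspider.py -o items.json, should show a populated images list on every record (assuming the feed export defaults), confirming that the pipeline recorded its results.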
