
Scrapy images do not download

The scraper runs and finds the URLs of the images, but for some reason it won't download them. It prints the Items' information in the terminal, but nothing gets saved to disk. I have tried every combination of settings I could find on SO, but no luck so far. This scraper used to work, so the problem might be linked to changes in a recent version of Scrapy.

I run the command scrapy runspider /path/to/myspider.py
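
Since the pipeline is wired up in settings.py, I want to be sure those project settings are actually loaded when the spider runs; as far as I know, runspider only applies them when invoked from inside the project tree, where scrapy.cfg can be found. A quick check from the project root:

from scrapy.utils.project import get_project_settings

# Prints what Scrapy will actually use; if ITEM_PIPELINES comes back empty,
# the project settings are not being picked up from this directory.
settings = get_project_settings()
print(settings.getdict("ITEM_PIPELINES"))
print(settings.get("IMAGES_STORE"))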

Versions:

  • scrapy==2.7.1
  • python==3.10.8

settings.py

BOT_NAME = "my_bot"

SPIDER_MODULES = ["my_bot.spiders"]
NEWSPIDER_MODULE = "my_bot.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'ooshot_marketplace (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    "my_bot.middlewares.OoshotMarketplaceSpiderMiddleware": 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "my_bot.middlewares.OoshotMarketplaceDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'my_bot.pipelines.MyPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# DUPEFILTER_DEBUG = True
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}


ITEM_PIPELINES = {"crawler.pipelines.SessionImagesPipeline": 1}

IMAGES_STORE = "images"

IMAGES_URLS_FIELD = "image_urls"  # copy verbatim
IMAGES_RESULT_FIELD = "images"  # copy verbatim
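
For reference, IMAGES_URLS_FIELD names the item field the pipeline reads download URLs from, and IMAGES_RESULT_FIELD names the field it writes its results to. Each recorded result is a dict shaped roughly like this (illustrative values; the hash and checksum are made up):

# One entry of the list the pipeline stores in item["images"]:
{
    "url": "http://my-url/photo.jpg",
    "path": "full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg",
    "checksum": "d41d8cd98f00b204e9800998ecf8427e",
    "status": "downloaded",  # present since Scrapy 2.2
}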

my_spider.py

import os

import scrapy


class ImageItem(scrapy.Item):

    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()


class MySpider(scrapy.Spider):

    name = "myspider"
    start_urls = ["http://my-url/"]

    http_user = "my-user"
    http_pass = "my-passwd"

    def parse(self, response):

        photographers_urls = response.css(".search-result-name a::attr(href)").extract()
        for photographer_url in photographers_urls:
            yield scrapy.Request(
                response.urljoin(photographer_url), callback=self.parse_photographer
            )

        photographers_pages_urls = response.css(".pagination a::attr(href)").extract()
        for photographer_page_url in photographers_pages_urls:
            yield scrapy.Request(
                response.urljoin(photographer_page_url), callback=self.parse
            )

    def parse_photographer(self, response):
        photographer_name = os.path.basename(response.url)
        categories_urls = response.css(
            ".profile-header-categories a::attr(href)"
        ).extract()
        for category_url in categories_urls:
            yield scrapy.Request(
                response.urljoin(category_url),
                callback=self.parse_category,
                meta={"photographer_name": photographer_name},
            )

    def parse_category(self, response):
        category_name = os.path.basename(response.url)
        photos_urls = response.css(".grid-col a::attr(href)").extract()
        for photo_url in photos_urls:
            yield scrapy.Request(
                response.urljoin(photo_url),
                callback=self.save_photo,
                meta={
                    "photographer_name": response.meta["photographer_name"],
                    "category_name": category_name,
                },
            )

    def save_photo(self, response):
        image_url = response.css(".js-photo-details-photo::attr(src)").extract_first()

        image_item = ImageItem()
        image_item["image_urls"] = [response.urljoin(image_url)]
        image_item["photographer_name"] = response.meta["photographer_name"]
        image_item["category_name"] = response.meta["category_name"]
        yield image_item

pipeline.py

import os

from scrapy.pipelines.images import ImagesPipeline, ImageException

class SessionImagesPipeline(ImagesPipeline):

    # Move each downloaded image into <category>/<photographer>/ and record its new path
    def item_completed(self, results, item, info):

        # iterate over the local file paths of all downloaded images
        for result in [x for ok, x in results if ok]:
            path = result["path"]
            # build the final path where the file should end up; adapt this
            # to your needs (the root could also come from get_project_settings())
            storage = "/my/path/images"

            category_path = os.path.join(storage, item["category_name"])
            photographer_path = os.path.join(category_path, item["photographer_name"])
            # create <storage>/<category>/<photographer>/ if it does not exist yet
            os.makedirs(photographer_path, exist_ok=True)

            target_path = os.path.join(photographer_path, os.path.basename(path))
            path = os.path.join(storage, path)

            # try to move the file; os.rename() returns None, so a failure
            # surfaces as OSError rather than a falsy return value
            try:
                os.rename(path, target_path)
            except OSError as exc:
                raise ImageException("Could not move image to target folder") from exc

            # write out the result with the new path, if the item declares
            # a result field (just like the stock pipeline does); setdefault
            # avoids a KeyError when the field has not been set yet
            if self.IMAGES_RESULT_FIELD in item.fields:
                result["path"] = target_path
                item.setdefault(self.IMAGES_RESULT_FIELD, []).append(result)

        return item

You are missing the images result field in your image item. ImagesPipeline records its download results in the field named by IMAGES_RESULT_FIELD ("images" in your settings), and your item_completed only writes them when that field is declared on the item (the self.IMAGES_RESULT_FIELD in item.fields check), so with the field missing nothing gets recorded.

class ImageItem(scrapy.Item):

    # ... other item fields ...
    image_urls = scrapy.Field()
    photographer_name = scrapy.Field()
    category_name = scrapy.Field()
    images = scrapy.Field()   # <----- add this
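
Without it, the pipeline has nowhere to record its results: a scrapy.Item only accepts keys that were declared as fields. A minimal illustration (the BrokenItem class is just for demonstration):

import scrapy

class BrokenItem(scrapy.Item):
    image_urls = scrapy.Field()

item = BrokenItem()
item["image_urls"] = ["http://my-url/photo.jpg"]  # fine: declared field
item["images"] = []  # raises KeyError: BrokenItem does not support field: images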

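As a side note: instead of moving files around in item_completed, you can let the pipeline write them to the final location in the first place by overriding file_path(), which receives the item as a keyword argument since Scrapy 2.4. A minimal sketch of that approach (the class name is mine, untested against your site):

import os

from scrapy.pipelines.images import ImagesPipeline


class PerPhotographerImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None, *, item=None):
        # The default path is "full/<sha1 of url>.<ext>"; keep the hashed
        # file name but nest it under <category>/<photographer>/.
        default_path = super().file_path(request, response=response, info=info, item=item)
        return os.path.join(
            item["category_name"],
            item["photographer_name"],
            os.path.basename(default_path),
        )

Directories under IMAGES_STORE are created automatically by the local files store, so no explicit mkdir calls are needed.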