
Scrapy dynamic feed export

I'm trying to have Scrapy set the export file based on the input file. This is the code I'm trying:

from pathlib import Path

import scrapy


class VideosSpider(scrapy.Spider):
    name = "videos"
    start_urls = []
    custom_settings = {
        "ITEM_PIPELINES": {
            "project.pipelines.VideosPipeline": 200,
            "project.pipelines.EpisodeImagesPipeline": 300,
        },
        "FILES_STORE": "episodes",
        "IMAGES_STORE": "episodes",
    }

    def __init__(self, urls_file=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # urls_file points to a file with one URL per line
        if urls_file:
            with open(urls_file, "r") as f:
                next(f)  # skip the first line (header)
                self.start_urls = [url.strip() for url in f]

            # ideally the feed name would correspond to the input file
            self.custom_settings["FEEDS"] = {
                Path(urls_file).with_suffix(".txt"): {
                    "format": "custom",
                },
            }

The above, however, results in an empty FEEDS setting in the pipeline:

class VideosPipeline:
    def __init__(self, uri, files):
        self.uri = uri
        self.files = files

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            uri=crawler.settings.get("FEEDS"), # this is empty
            files=crawler.settings.get("FILES_STORE"),
        )

    def open_spider(self, spider):
        feeds = [k for k, v in self.uri.items() if v["format"] == "custom"]

        self.exporter = CustomExporter(feeds[0])

And a small piece of code from the custom exporter:

from scrapy.exporters import BaseItemExporter


class CustomExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file

Where am I going wrong?

Thank you for the help.

You can use URI parameters in the FEEDS setting to dynamically set the exported file path. This lets you use any spider attribute to compose the feed URI.


See this part of the docs:

Any other named parameter gets replaced by the spider attribute of the same name. For example, %(site_id)s would get replaced by the spider.site_id attribute the moment the feed is being created.

https://docs.scrapy.org/en/latest/topics/feed-exports.html#storage-uri-parameters
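
For instance, a FEEDS entry can also use the built-in %(name)s and %(time)s parameters (a generic sketch, not tied to your spider; the output path is just an illustration):

custom_settings = {
    "FEEDS": {
        # %(name)s -> spider name, %(time)s -> timestamp when the feed is created
        "output/%(name)s_%(time)s.json": {"format": "json"},
    },
}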


So, in your example, it would be something like this (note that I also changed the file extension, since txt is not one of the standard serialization formats):

from pathlib import Path

import scrapy


class VideosSpider(scrapy.Spider):
    name = "videos"
    start_urls = []
    custom_settings = {
        "ITEM_PIPELINES": {
            "project.pipelines.VideosPipeline": 200,
            "project.pipelines.EpisodeImagesPipeline": 300,
        },
        "FILES_STORE": "episodes",
        "IMAGES_STORE": "episodes",
        # %(exported_file)s gets replaced by the spider attribute of the same name
        "FEEDS": {"file:///somepath/%(exported_file)s": {"format": "csv"}},
    }

    def __init__(self, urls_file=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # read start_urls from urls_file as before, then expose the output name
        self.exported_file = str(Path(urls_file).with_suffix(".csv"))
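
Assuming you pass the input file via Scrapy's -a spider argument (the file name episodes/season1.txt below is only an illustration), a run would look like:

scrapy crawl videos -a urls_file=episodes/season1.txt

With the code above, spider.exported_file becomes episodes/season1.csv, so the feed is written to file:///somepath/episodes/season1.csv when it is created.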
