简体   繁体   中英

Scrapy spider exporting CSV file in JSON format

I created a GUI app with Scrapy Spider but when I save the data as CSV, it doesn't export in the right format. It exports the data in JSON format. What could be the reason and fix for this? You can see the output in the screenshot below.

在此处输入图像描述

The complete project is here: https://drive.google.com/file/d/1Ztgqi6-dLH6YHJBo-e9R5rwvWdCGOJhD/view?usp=sharing

GUI App Code is Below. It's dynamic so it will work with any scrapy project:

Please select "CSV" from the DropDown:

from tkinter import *
from tkinter import messagebox
from tkinter import filedialog
from scrapy.utils import project
from scrapy import spiderloader
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
import threading


def get_spiders():
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    return spider_loader.list()

def get_chosen_spider(value):
    global chosen_spider
    chosen_spider = value
    return chosen_spider

def get_chosen_feed(value):
    global chosen_feed
    chosen_feed = value
    return chosen_feed


def browse_button():
    global folder_path
    folder_path = filedialog.askdirectory()
    folder_path_entry.delete(0, END)
    folder_path_entry.insert(0, folder_path)
    return folder_path

def execute_spider():
    if dataset_entry.get() == '' or chosen_feed not in ['CSV', 'JSON']:
        messagebox.showerror('Error', 'All entries are required')
        return
    
    try:
        feed_uri = f"file:///{folder_path}/{dataset_entry.get()}.{chosen_feed}"
    except:
        messagebox.showerror('Error', 'All entries all required')
    
    settings = project.get_project_settings()
    settings.set('FEED_URI', feed_uri)
    settings.set('FEED_TYPE', chosen_feed)

    configure_logging()
    runner = CrawlerRunner(settings)
    runner.crawl(chosen_spider)
    
    reactor.run(installSignalHandlers=False)

def start_execute_thread(event):
    global execute_thread
    execute_thread = threading.Thread(target=execute_spider, daemon=True)
    execute_thread.start()
    app.after(10, check_execute_thread)

def check_execute_thread():
    if execute_thread.is_alive():
        app.after(10, check_execute_thread)



app = Tk()

#Spiders list
spider_label = Label(app, text='Choose a spider')
spider_label.grid(row=0 , column=0, sticky=W, pady=10, padx=10)

spider_text = StringVar(app)
spider_text.set('Choose a spider')
spiders = [spider for spider in get_spiders()]

spiders_dropdown = OptionMenu(app, spider_text, *spiders, command=get_chosen_spider)
spiders_dropdown.grid(row=0, column=1, columnspan=2)

# Feed Type
feed_label = Label(app, text='Choose a feed')
feed_label.grid(row=1 , column=0, sticky=W, pady=10, padx=10)

feed_text = StringVar(app)
feed_text.set('Choose a feed')
feeds = ['JSON', 'CSV']

feed_dropdown = OptionMenu(app, feed_text, *feeds, command=get_chosen_feed)
feed_dropdown.grid(row=1, column=1, columnspan=2)

# Path Entry
folder_path_text = StringVar(app)
folder_path_entry = Entry(app, textvariable=folder_path_text)
folder_path_entry.grid(row=2, column=0, pady=10, padx=10)

# Dataset Entry
dataset_text = StringVar(app)
dataset_entry = Entry(app, textvariable=dataset_text, width=10)
dataset_entry.grid(row=2, column=1, pady=10, padx=10)

browse_btn = Button(app, text='Browse', command=browse_button)
browse_btn.grid(row=2, column=2)

#update this one too
execute_btn = Button(app, text='Execute', command=lambda: start_execute_thread(None))
execute_btn.grid(row=3, column=0, columnspan=3)

app.title('Spider Executer')
app.geometry('300x200')
app.resizable(False, False)
app.mainloop()

Please change this line:

settings.set('FEED_TYPE', chosen_feed)

to:

settings.set('FEED_FORMAT', chosen_feed)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM