[英]Scrapy spider exporting CSV file in JSON format
I created a GUI app with Scrapy Spider but when I save the data as CSV, it doesn't export in the right format.我使用 Scrapy Spider 创建了一个 GUI 应用程序,但是当我将数据保存为 CSV 时,它不会以正确的格式导出。 It exports the data in JSON format.
它以 JSON 格式导出数据。 What could be the reason and fix for this?
这可能是什么原因和解决方法? You can see the output in the screenshot below.
您可以在下面的屏幕截图中看到 output。
The complete project is here: https://drive.google.com/file/d/1Ztgqi6-dLH6YHJBo-e9R5rwvWdCGOJhD/view?usp=sharing完整的项目在这里: https://drive.google.com/file/d/1Ztgqi6-dLH6YHJBo-e9R5rwvWdCGOJhD/view?usp=sharing
GUI App Code is Below. GUI 应用程序代码如下。 It's dynamic so it will work with any scrapy project:
它是动态的,因此可以与任何 scrapy 项目一起使用:
Please select "CSV" from the DropDown:请从下拉菜单 select "CSV":
from tkinter import *
from tkinter import messagebox
from tkinter import filedialog
from scrapy.utils import project
from scrapy import spiderloader
from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
import threading
def get_spiders():
settings = project.get_project_settings()
spider_loader = spiderloader.SpiderLoader.from_settings(settings)
return spider_loader.list()
def get_chosen_spider(value):
global chosen_spider
chosen_spider = value
return chosen_spider
def get_chosen_feed(value):
global chosen_feed
chosen_feed = value
return chosen_feed
def browse_button():
global folder_path
folder_path = filedialog.askdirectory()
folder_path_entry.delete(0, END)
folder_path_entry.insert(0, folder_path)
return folder_path
def execute_spider():
if dataset_entry.get() == '' or chosen_feed not in ['CSV', 'JSON']:
messagebox.showerror('Error', 'All entries are required')
return
try:
feed_uri = f"file:///{folder_path}/{dataset_entry.get()}.{chosen_feed}"
except:
messagebox.showerror('Error', 'All entries all required')
settings = project.get_project_settings()
settings.set('FEED_URI', feed_uri)
settings.set('FEED_TYPE', chosen_feed)
configure_logging()
runner = CrawlerRunner(settings)
runner.crawl(chosen_spider)
reactor.run(installSignalHandlers=False)
def start_execute_thread(event):
global execute_thread
execute_thread = threading.Thread(target=execute_spider, daemon=True)
execute_thread.start()
app.after(10, check_execute_thread)
def check_execute_thread():
if execute_thread.is_alive():
app.after(10, check_execute_thread)
app = Tk()
#Spiders list
spider_label = Label(app, text='Choose a spider')
spider_label.grid(row=0 , column=0, sticky=W, pady=10, padx=10)
spider_text = StringVar(app)
spider_text.set('Choose a spider')
spiders = [spider for spider in get_spiders()]
spiders_dropdown = OptionMenu(app, spider_text, *spiders, command=get_chosen_spider)
spiders_dropdown.grid(row=0, column=1, columnspan=2)
# Feed Type
feed_label = Label(app, text='Choose a feed')
feed_label.grid(row=1 , column=0, sticky=W, pady=10, padx=10)
feed_text = StringVar(app)
feed_text.set('Choose a feed')
feeds = ['JSON', 'CSV']
feed_dropdown = OptionMenu(app, feed_text, *feeds, command=get_chosen_feed)
feed_dropdown.grid(row=1, column=1, columnspan=2)
# Path Entry
folder_path_text = StringVar(app)
folder_path_entry = Entry(app, textvariable=folder_path_text)
folder_path_entry.grid(row=2, column=0, pady=10, padx=10)
# Dataset Entry
dataset_text = StringVar(app)
dataset_entry = Entry(app, textvariable=dataset_text, width=10)
dataset_entry.grid(row=2, column=1, pady=10, padx=10)
browse_btn = Button(app, text='Browse', command=browse_button)
browse_btn.grid(row=2, column=2)
#update this one too
execute_btn = Button(app, text='Execute', command=lambda: start_execute_thread(None))
execute_btn.grid(row=3, column=0, columnspan=3)
app.title('Spider Executer')
app.geometry('300x200')
app.resizable(False, False)
app.mainloop()
Please change this line:请更改此行:
settings.set('FEED_TYPE', chosen_feed)
to:至:
settings.set('FEED_FORMAT', chosen_feed)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.