[英]python class: run functions step by step and save them
我有一個讀取數據幀的類,然后是另一個處理該數據幀的類。 處理類中的函數應逐步應用於同一數據幀,以形成最終數據幀,然后將其保存為 csv 文件。
from pydantic import BaseModel
from config import DATA_REPO
import pandas as pd
import os
class PandaDataFrame(BaseModel):
data: pd.DataFrame
class Config:
arbitrary_types_allowed = True
class Directory(BaseModel):
data_directory: str
class DataToPandaReader(object):
def csv_file_reader(self, directory: Directory):
directory = directory.data_directory
for file in os.listdir(directory):
if file.endswith('.csv'):
return pd.read_csv(os.path.join(directory, file))
class DataProcessor(object):
def remove_punctuation(self, my_: PandaDataFrame):
my_data_to_process = my_.data
for col in my_data_to_process:
if any(word in col for word in ['example', 'text', 'Answer']):
my_data_to_process = my_data_to_process[col].str.replace('[^\w\s]', '', regex=True)
return add_number_column(my_data_to_process)
def add_number_column(self, my_: PandaDataFrame):
my_data_to_process = my_.data
my_data_to_process['sentence_number'] = range(len(my_data_to_process))
return save_final_dataframe(my_data_to_process)
def save_final_dataframe(self, my_:PandaDataFrame):
my_data_to_process = my_.data
return my_data_to_process.to_csv('final_data.csv')
def parse_data_process(directory_to_csv_file):
toprocess = DataProcessor()
toprocess.save_final_dataframe(directory_to_csv_file)
toprocess.remove_punctuation(directory_to_csv_file)
toprocess.add_number_column(directory_to_csv_file)
return toprocess
if __name__ == '__main__':
parse_data_process(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
現在,例如要實例化 DataProcessor 類中的第一個函數,我將執行以下操作
DataProcessor().remove_punctuation(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
但我的目的是在 DataProcessor 類中逐步運行所有這些函數,因此 save_final_dataset 函數將保存已刪除標點符號並且還有一個數字列的數據幀。
更新:
按照給出的答案,我進行了這些更改,但得到函數未知的錯誤。
def parse_data_process(directory_to_csv_file):
toprocess = DataProcessor()
toprocess.save_final_dataframe(directory_to_csv_file)
toprocess.remove_punctuation(directory_to_csv_file)
toprocess.add_number_column(directory_to_csv_file)
return toprocess
if __name__ == '__main__':
parse_data_process(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))
除非我誤解了您的用例,否則您需要做的就是替換
return my_data_to_process
...在 remove_punctuation 函數中
return add_number_column(my_data_to_process)
...然后更換
return my_data_to_process
...在 add_number_column 函數中
return save_final_dataframe(my_data_to_process)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.