python class: run functions step by step and save them

Question

I have a class that reads a dataframe and then another class which processes that dataframe. the functions in the processing class should be applied on the same dataframe step by step to shape the final dataframe which is then saved as a csv file.

from pydantic import BaseModel
from config import DATA_REPO
import pandas as pd
import os

class PandaDataFrame(BaseModel):
data: pd.DataFrame

      class Config:
          arbitrary_types_allowed = True

class Directory(BaseModel):
    data_directory: str

class DataToPandaReader(object):

    def csv_file_reader(self, directory: Directory):
        directory = directory.data_directory
        for file in os.listdir(directory):
            if file.endswith('.csv'):
               return pd.read_csv(os.path.join(directory, file))

class DataProcessor(object):

    def remove_punctuation(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        for col in my_data_to_process:
            if any(word in col for word in ['example', 'text', 'Answer']):
                my_data_to_process = my_data_to_process[col].str.replace('[^\w\s]', '', regex=True)
                return add_number_column(my_data_to_process)

    def add_number_column(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        my_data_to_process['sentence_number'] = range(len(my_data_to_process))
        return save_final_dataframe(my_data_to_process)

    def save_final_dataframe(self, my_:PandaDataFrame):
        my_data_to_process = my_.data
        return my_data_to_process.to_csv('final_data.csv')

def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess

if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))

now, for example to instantiate the first function in DataProcessor class, I would do the following

DataProcessor().remove_punctuation(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))

but my intention is to run all these function in the DataProcessor class step by step, so the save_final_dataset function would save the dataframe that is has its punctuation removed and also has a number column.

update:

following the answer given, I made these changes, but get the error that the functions are not known.

def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess

if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data= DataToPandaReader().csv_file_reader(Directory(data_directory = os.path.join(DATA_REPO, 'input_data')))))

Answer 1

Unless I've misunderstood your use-case, all you need to do is replace

return my_data_to_process

...in the remove_punctuation function with

return add_number_column(my_data_to_process)

...then replace

return my_data_to_process

...in the add_number_column function with

return save_final_dataframe(my_data_to_process)

python class: run functions step by step and save them

Question

1 answers

solution1
1 ACCPTED 2021-07-21 14:58:46

python class: run functions step by step and save them

Question

1 answers

solution1 1 ACCPTED 2021-07-21 14:58:46

solution1
1 ACCPTED 2021-07-21 14:58:46