How can I parallelize my Python program to clean a dataset with 2000 CSV files using Pandas dataframes?

Question

This is the code for processing one file.

import pandas as pd
import numpy as np

# Read one file. `names=` supplies the column labels, so pandas treats
# every line of the file (including the first) as data.
df = pd.read_csv("cleaning/semicleaned1.csv", sep=",",
                 names=["press", "gph", "temp", "wspd"])
# Drop the unwanted 'gph' column and write the rest without the index column.
df = df.drop(['gph'], axis=1)
df.to_csv("cleaning/FINAL.csv", sep=',', index=False)

Answer 1

To process all the CSV files in your cleaning folder, you could use glob to get a list of them. I would suggest creating each output filename from its input filename, e.g. by prepending cleaned_ to the filename:

import pandas as pd
import glob
import os


for csv_filename in glob.glob('cleaning/*.csv'):
    folder, basename = os.path.split(csv_filename)

    # Outputs are written into the same folder that is being globbed, so a
    # second run would otherwise pick up earlier results and clean them again.
    if basename.startswith('cleaned_'):
        continue

    # Create a suitable output filename based on the input filename:
    # cleaning/foo.csv -> cleaning/cleaned_foo.csv
    cleaned_filename = os.path.join(folder, 'cleaned_' + basename)
    print('{} -> {}'.format(csv_filename, cleaned_filename))

    # Process CSV: name the four columns, drop 'gph', write without the index.
    df = pd.read_csv(csv_filename, names=["press", "gph", "temp", "wspd"])
    df = df.drop(['gph'], axis=1)
    df.to_csv(cleaned_filename, sep=',', index=False)

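Note: care is needed if your files have a header row. Because names= is passed, pandas treats every line as data, so a header line would be read as an ordinary row; in that case also pass header=0 so the existing header is replaced by these column names.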

You could use a multiprocessing.Pool() to run this in parallel:

from multiprocessing import Pool
import pandas as pd
import glob
import os


def clean_csv(csv_filename):
    """Clean one CSV file and write the result next to it.

    The file is read as four columns named press, gph, temp and wspd
    (every line is treated as data), the ``gph`` column is dropped, and the
    remaining columns are written to ``cleaned_<original name>`` in the
    same folder as the input.

    Args:
        csv_filename: Path of the CSV file to clean.

    Returns:
        The path of the cleaned file that was written, so that callers such
        as ``Pool.map`` collect something more useful than ``None``.
    """
    # cleaning/foo.csv -> cleaning/cleaned_foo.csv
    folder, basename = os.path.split(csv_filename)
    cleaned_filename = os.path.join(folder, 'cleaned_' + basename)
    print('{} -> {}'.format(csv_filename, cleaned_filename))

    # Process CSV
    df = pd.read_csv(csv_filename, names=["press", "gph", "temp", "wspd"])
    df = df.drop(['gph'], axis=1)
    df.to_csv(cleaned_filename, sep=',', index=False)
    return cleaned_filename

# The guard keeps worker processes that import this module from starting
# pools of their own.
if __name__ == '__main__':
    # Outputs land in the folder being globbed, so leave out results of an
    # earlier run; otherwise a re-run would clean the cleaned files again.
    csv_filenames = [
        path for path in glob.glob('cleaning/*.csv')
        if not os.path.basename(path).startswith('cleaned_')
    ]

    # Ten worker processes share the files; map() returns one result per
    # input file, in input order.
    with Pool(10) as pool:
        print(pool.map(clean_csv, csv_filenames))

How can I parallelize my Python program to clean a dataset with 2000 CSV files using Pandas dataframes?

Question

1 answer

solution1
1 2018-03-07 10:50:26

How can I parallelize my Python program to clean a dataset with 2000 CSV files using Pandas dataframes?

Question

1 answer

solution1 1 2018-03-07 10:50:26

solution1
1 2018-03-07 10:50:26