
Memory error while processing large dataframes in Python

I have a large CSV file (~5-10 GB) which I'm converting to a pandas dataframe, processing, and converting back to a CSV file.

Following is my code:

# Imports needed by the code below
# (unique_everseen comes from the more_itertools package, or the equivalent itertools recipe)
import os
import json
import numpy as np
import pandas as pd
from more_itertools import unique_everseen

# Convert csv to dataframe
df = pd.read_csv('A.csv')

#Normalize dataframe column names (lowercase, strip whitespace, replace separators with underscores)
df.columns = df.columns.str.lower()
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(' ','_')
df.columns = df.columns.str.replace('-','_')
df.columns = df.columns.str.replace(':',"_")

#Remove duplicate columns in the dataframe
df = df.loc[:,~df.columns.duplicated()]

#Drop rows with null date which are aggregate values
df = df[pd.notnull(df['date'])]

## Formatting and handling headers

# Read headers of the dataframe into headers_new_unsorted variable
headers_new_unsorted = df.columns
headers_new_unsorted = [x.encode('UTF8') for x in headers_new_unsorted]           

i = 0
while i<len(headers_new_unsorted):
    headers_new_unsorted[i] = headers_new_unsorted[i].lower()
    headers_new_unsorted[i] = headers_new_unsorted[i].strip()
    headers_new_unsorted[i] = headers_new_unsorted[i].replace(" ","_")
    headers_new_unsorted[i] = headers_new_unsorted[i].replace("-","_")
    headers_new_unsorted[i] = headers_new_unsorted[i].replace(".","_")
    headers_new_unsorted[i] = headers_new_unsorted[i].replace(":","_")
    i += 1

headers_new = list(unique_everseen(headers_new_unsorted))

#If the headers text file is not present in the local folder, create an empty one
if not os.path.exists('head.txt'):
    open('head.txt', 'w').close()

#Read the baseline headers from the local file
with open('head.txt', 'r') as head_file:
    baseline_headers = head_file.read().split('\n')

Dfrn_header = set(baseline_headers).symmetric_difference(headers_new) 
Dfrn_header = filter(None, Dfrn_header)

#Add baseline columns that are missing from the dataframe as empty (NaN) columns
for i in Dfrn_header:
    if i not in baseline_headers:
        df[i] = df[i] 
    else:
        df[i] = np.nan

organize_headers = baseline_headers

#append newly added columns to organize headers local list variable
for i in headers_new:
    if i not in baseline_headers:
        organize_headers = organize_headers+[i]

organize_headers = filter(None, organize_headers)
print organize_headers
new_headers_added = set(organize_headers) - set(baseline_headers)
new_headers_added = [o for o in organize_headers if o in new_headers_added]

#Organize dataframe columns same as organize header list
df = df[organize_headers]

#** Start Data processing **#

#Replace all null values with the string "None"
df["A"].fillna("None", inplace=True)
df["P"].fillna("None", inplace=True)
df["C"].fillna("None", inplace=True)
df["D"].fillna("None", inplace = True)

#Fill in the State column based on the city column (C)
df.loc[df.C.str.startswith('New York'), 'State'] = "NY"
df.loc[df.C.str.startswith('San Jose'), 'State'] = "California"
df.loc[df.C.str.startswith('Portland'), 'State'] = "Oregon"
df.loc[df.C.str.startswith('Arlington'), 'State'] = "Texas"
df.loc[df.C.str.startswith('San Diego'), 'State'] = "California"
df.loc[df.C.str.startswith('LA'), 'State'] = "California"
df.loc[df.C.str.startswith('Rolla'), 'State'] = "Missouri"
df.loc[df.C.str.startswith('Detroit'), 'State'] = "MI"
df.loc[df.C.str.startswith('Chicago'), 'State'] = "IL"
df.loc[df.C.str.startswith('Louisville'), 'State'] = "Kentucky"
df.loc[df.C.str.startswith('San Francisco'), 'State'] = "California"
df["State"].fillna("None", inplace = True)

#Load the reference data from the local JSON file
with open('test.json') as json_file:
    ref_data = json.load(json_file)

json_len = len(ref_data)

for i in range(0,json_len):
    for j in range (0,len(ref_data[i])):
        if str(ref_data[i].keys()[j]) != "A":
            df.loc[df[str(ref_data[i].keys()[j])].isnull() & (df.A.astype(str).str.contains(str(ref_data[0]["A"]))), str(ref_data[i].keys()[j])] = str(ref_data[i].values()[j])

#** End of Data Processing **#

df.to_csv('processed.csv',sep=',', index = False)

I'm running out of memory while processing large files. I have already increased the RAM of the machine this code runs on, and increasing it further is not feasible. How can I lower memory usage?

Pandas' read_csv has a skiprows parameter that lets you select which rows to read into memory. You can just read chunks of your gigantic file into memory and process them separately.

Here is an example of the above. For the sake of simplicity, assume that you do all of your processing in a process() function, and that your CSV is 1,000,000 lines long.

import pandas as pd

chunk_size = 100000
for i in range(0, 10):
    # Skip every row that is not in the current chunk; row 0 is the header, so keep it
    skipfunc = lambda x: x != 0 and not (i * chunk_size < x <= (i + 1) * chunk_size)
    df = pd.read_csv('A.csv', skiprows=skipfunc)  # only 1/10 of the csv is in memory at a time
    processed_df = process(df)
    # Write the header once, then append each processed chunk to the output file
    processed_df.to_csv('processed.csv', sep=',', index=False,
                        mode='w' if i == 0 else 'a', header=(i == 0))
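
If you prefer not to write the skip function yourself, read_csv also accepts a chunksize parameter that returns an iterator of DataFrames, so the file is only scanned once. The following is a minimal sketch assuming the same process() function, file names, and chunk size as above:

import pandas as pd

# Stream the CSV in chunks of 100,000 rows; only one chunk is held in memory at a time
for i, chunk in enumerate(pd.read_csv('A.csv', chunksize=100000)):
    processed_chunk = process(chunk)  # per-chunk processing, as in the example above
    # Write the header once, then append each processed chunk to the output file
    processed_chunk.to_csv('processed.csv', sep=',', index=False,
                           mode='w' if i == 0 else 'a', header=(i == 0))

Either way, peak memory usage is bounded by the chunk size rather than by the size of the whole file.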
