I'm not sure if this will be enough information to provide but I am currently trying to extract and format into csv a subset of data from a very large file containing many JSON objects as lines and dumping it into one csv file. I have the below implementation. Speed isn't too bad but I was wondering if there is a more efficient way to do this. I feel like the pandas part where I create dataframes can be a little nicer:
for files in zip_files:
with zipfile.ZipFile(files, 'r') as myzip:
for logfile in myzip.namelist():
list1 = []
list2 = []
f = myzip.open(logfile)
contents = f.readlines()
for line in contents[:]:
try:
parsed = json.loads(line[:-2])
if "key1" in parsed.keys():
if "val1" in parsed['key1']['key2']:
if "val2" in parsed['key3']:
list1.append(parsed['key1'])
list2.append(parsed['key3'])
except ValueError as e:
pass
else:
pass
df1 = pd.DataFrame(list1)
df2 = pd.DataFrame(list2)
df3 = df2.join(df1)
df3['col1'] = df3['col1'].apply(lambda x: ','.join([str(i) for i in x]))
df3 = df3.drop_duplicates()
with open(csvout, 'a') as f2:
df.to_csv(f2, header=None, index=False)
f2.close()
f.close()
I did the following:
STRANGE
(= I am not sure what you are doing), EFFICIENT
(= can be made more efficient), SIMPLIFY
(=can be made simpler). To verify that these suggestions are actually helpful, consider using ipython %timeit
. Enter the following at the IPython prompt:
In [0]: %timeit -n<N> %run script.py
Where <N>
is the number of runs to average over (1000 by default, which might take too long).
for files in zip_files:
with zipfile.ZipFile(files, 'r') as myzip:
for logfile in myzip.namelist():
list1 = []
list2 = []
f = myzip.open(logfile)
# contents = f.readlines()
# for line in contents[:]:
for line in f: # EFFICIENT: does the same without making a copy
try:
parsed = json.loads(line[:-2])
# if "key1" in parsed.keys():
if "key1" in parsed: # EFFICIENT: no copy
# STRANGE: 'val' in dict checks for key existence by
# default, are you sure this is what you want?
if "val1" in parsed['key1']['key2']:
if "val2" in parsed['key3']:
list1.append(parsed['key1'])
list2.append(parsed['key3'])
except ValueError as e:
pass
# STRANGE: Why is this here?
# else:
# pass
df1 = pd.DataFrame(list1)
df2 = pd.DataFrame(list2)
df3 = df2.join(df1)
# EFFICIENT: prefer generator over list comprehension
# df3['col1'] = df3['col1'].apply(lambda x: ','.join([str(i) for i in x]))
df3['col1'] = df3['col1'].apply(lambda x: ','.join(str(i) for i in x))
df3.drop_duplicates(inplace=True)
# SIMPLIFY:
# with open(csvout, 'a') as f2:
# df.to_csv(f2, header=None, index=False)
# f2.close()
# STRANGE: where does `df` come from? Shouldn't this be df3?
df.to_csv(csvout, mode='a', header=None, index=False)
# STRANGE: you open f in a loop, but close it outside of the loop?
f.close()
If you have enough memory, the following might be faster: rather than appending to the file you concatenate all files in memory first. This also changes the behavior slightly:
Also some stylistic changes:
for files in zip_files:
with zipfile.ZipFile(files, 'r') as myzip:
list1, list2 = [], [] # Notice these are outside the loop
for logfile in myzip.namelist():
with myzip.open(logfile) as f:
for line in f:
try:
parsed = json.loads(line[:-2])
except ValueError as e: # Presumably we only wish to catch json value errors
pass
else:
if ("key1" in parsed
and "val1" in parsed['key1']['key2']
and "val2" in parsed['key3']):
list1.append(parsed['key1'])
list2.append(parsed['key3'])
# Write only once
df = pd.DataFrame(list2).join(pd.DataFrame(list1))
df['col1'] = df['col1'].apply(lambda x: ','.join(str(i) for i in x))
df.drop_duplicates(inplace=True)
df.to_csv(csvout, header=None, index=False)
Keeping the duplicate filtering local to each file:
for files in zip_files:
with zipfile.ZipFile(files, 'r') as myzip:
dfs = []
for logfile in myzip.namelist():
list1, list2 = [], []
with myzip.open(logfile) as f:
for line in f:
try:
parsed = json.loads(line[:-2])
except ValueError as e: # Presumably we only wish to catch json value errors
pass
else:
if ("key1" in parsed
and "val1" in parsed['key1']['key2']
and "val2" in parsed['key3']):
list1.append(parsed['key1'])
list2.append(parsed['key3'])
# Build a temporary dataframe to filter the duplicates:
tmp = pd.DataFrame(list2).join(pd.DataFrame(list1))
tmp['col1'] = tmp['col1'].apply(lambda x: ','.join(str(i) for i in x))
tmp.drop_duplicates(inplace=True)
dfs.append(tmp)
# Write only once
pd.concat(dfs, ignore_index=True).to_csv(csvout, header=None, index=False)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.