
More efficient way to parse JSON and convert to CSV in Python

I'm not sure this is enough information to go on, but I am currently trying to extract a subset of data from very large files that contain many JSON objects, one per line, and dump it all into a single CSV file. I have the implementation below. The speed isn't too bad, but I was wondering if there is a more efficient way to do this. I feel like the pandas part, where I create the dataframes, could be a little nicer:

for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        for logfile in myzip.namelist():
            list1 = []
            list2 = []
            f = myzip.open(logfile)
            contents = f.readlines()
            for line in contents[:]:
                try:
                    parsed = json.loads(line[:-2])
                    if "key1" in parsed.keys():
                        if "val1" in parsed['key1']['key2']:
                            if "val2" in parsed['key3']:
                                list1.append(parsed['key1'])
                                list2.append(parsed['key3'])
                except ValueError as e:
                    pass
                else:
                    pass
            df1 = pd.DataFrame(list1)
            df2 = pd.DataFrame(list2)
            df3 = df2.join(df1)
            df3['col1'] = df3['col1'].apply(lambda x: ','.join([str(i) for i in x]))
            df3 = df3.drop_duplicates()
        with open(csvout, 'a') as f2:
            df.to_csv(f2, header=None, index=False)
            f2.close()
        f.close()

I did the following:

  • Annotated the original version with STRANGE (= I am not sure what you are doing), EFFICIENT (= can be made more efficient), and SIMPLIFY (= can be made simpler).
  • Created two other versions that may be more efficient but change the behavior, either in memory use or in how duplicates end up in the output. This is elaborated below.

To verify that these suggestions are actually helpful, consider timing them with IPython's %timeit. Enter the following at the IPython prompt:

In [0]: %timeit -n<N> %run script.py

Where <N> is the number of runs to average over (if you omit -n, IPython picks a loop count automatically, which might take too long for a slow script).
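
Alternatively, if you wrap the whole loop in a function, you can time it from a plain script with the standard-library timeit module. This is only a minimal sketch; the process_zips() function in script.py is hypothetical:

import timeit

# Time three full runs of the (hypothetical) process_zips() function that
# wraps the whole loop in script.py; timeit.timeit returns total seconds.
elapsed = timeit.timeit("process_zips()",
                        setup="from script import process_zips",
                        number=3)
print("average: %.2f s per run" % (elapsed / 3))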

Annotated Original

for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        for logfile in myzip.namelist():
            list1 = []
            list2 = []
            f = myzip.open(logfile)

            # contents = f.readlines()
            # for line in contents[:]:
            for line in f: # EFFICIENT: does the same without making a copy

                try:
                    parsed = json.loads(line[:-2])

                    # if "key1" in parsed.keys():
                    if "key1" in parsed: # EFFICIENT: no copy

                        # STRANGE: 'val' in dict checks for key existence by
                        # default; are you sure this is what you want?
                        if "val1" in parsed['key1']['key2']:
                            if "val2" in parsed['key3']:
                                list1.append(parsed['key1'])
                                list2.append(parsed['key3'])
                except ValueError as e:
                    pass
                # STRANGE: Why is this here?
                # else:
                #     pass
            df1 = pd.DataFrame(list1)
            df2 = pd.DataFrame(list2)
            df3 = df2.join(df1)

            # EFFICIENT: prefer generator over list comprehension
            # df3['col1'] = df3['col1'].apply(lambda x: ','.join([str(i) for i in x]))
            df3['col1'] = df3['col1'].apply(lambda x: ','.join(str(i) for i in x))

            df3.drop_duplicates(inplace=True)

        # SIMPLIFY:
        # with open(csvout, 'a') as f2:
        #     df.to_csv(f2, header=None, index=False)
        #     f2.close()
        # STRANGE: where does `df` come from? Shouldn't this be df3?
        df.to_csv(csvout, mode='a', header=None, index=False)

        # STRANGE: you open f in a loop, but close it outside of the loop?
        f.close()
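
To unpack the STRANGE note about the membership test: in Python, x in y checks keys when y is a dict, elements when y is a list, and substrings when y is a string, so what the nested checks do depends entirely on the JSON types of key2 and key3. A small self-contained illustration with made-up data:

import json

# Made-up record showing what `in` tests for each JSON type.
parsed = json.loads('{"key1": {"key2": {"val1": 1}}, "key3": ["val2"]}')

print("val1" in parsed["key1"]["key2"])   # True: key lookup in a dict
print("val2" in parsed["key3"])           # True: element test in a list
print("val1" in '{"val1": 1}')            # True: substring test in a str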

Build in Memory, Write Once

If you have enough memory, the following might be faster: rather than appending to the CSV once per logfile, you first collect the rows from all logfiles of an archive in memory and write them in one go. This also changes the behavior slightly:

  • duplicates are filtered across all logfiles in a zip archive, not just within each logfile

Also some stylistic changes:

for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        list1, list2 = [], [] # Notice these are outside the loop
        for logfile in myzip.namelist():
            with myzip.open(logfile) as f:
                for line in f:
                    try:
                        parsed = json.loads(line[:-2])
                    except ValueError:  # Presumably we only wish to catch json value errors
                        pass
                    else:
                        if ("key1" in parsed
                            and "val1" in parsed['key1']['key2']
                            and "val2" in parsed['key3']):
                            list1.append(parsed['key1'])
                            list2.append(parsed['key3'])

        # Write only once
        df = pd.DataFrame(list2).join(pd.DataFrame(list1))
        df['col1'] = df['col1'].apply(lambda x: ','.join(str(i) for i in x))
        df.drop_duplicates(inplace=True)
        df.to_csv(csvout, mode='a', header=None, index=False)  # append: csvout accumulates across zip files
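
One caveat that applies to all versions: in Python 3, ZipFile.open() returns a binary file object, so each line is bytes, and line[:-2] silently assumes every line ends in exactly ",\n". A minimal sketch of a more explicit alternative, assuming the members are UTF-8 encoded (the archive name is made up):

import io
import json
import zipfile

with zipfile.ZipFile("logs.zip") as myzip:  # hypothetical archive
    with myzip.open(myzip.namelist()[0]) as raw:
        # ZipFile.open() yields bytes; TextIOWrapper decodes to str so the
        # newline and trailing comma can be stripped explicitly instead of
        # relying on line[:-2] removing exactly ",\n".
        for line in io.TextIOWrapper(raw, encoding="utf-8"):
            parsed = json.loads(line.rstrip().rstrip(","))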

Build in Memory, Write Once, Filter Duplicates Only Per File

Keeping the duplicate filtering local to each file:

for files in zip_files:
    with zipfile.ZipFile(files, 'r') as myzip:
        dfs = []
        for logfile in myzip.namelist():
            list1, list2 = [], []
            with myzip.open(logfile) as f:
                for line in f:
                    try:
                        parsed = json.loads(line[:-2])
                    except ValueError:  # Presumably we only wish to catch json value errors
                        pass
                    else:
                        if ("key1" in parsed
                            and "val1" in parsed['key1']['key2']
                            and "val2" in parsed['key3']):
                            list1.append(parsed['key1'])
                            list2.append(parsed['key3'])
            # Build a temporary dataframe to filter the duplicates:
            tmp = pd.DataFrame(list2).join(pd.DataFrame(list1))
            tmp['col1'] = tmp['col1'].apply(lambda x: ','.join(str(i) for i in x))
            tmp.drop_duplicates(inplace=True)
            dfs.append(tmp)

        # Write only once
        pd.concat(dfs, ignore_index=True).to_csv(csvout, mode='a', header=None, index=False)  # append across zip files
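
To make the behavioral difference between the two versions concrete, here is a tiny self-contained example with made-up rows: de-duplicating globally keeps one copy of a row that appears in two logfiles, while de-duplicating per file keeps both.

import pandas as pd

file_a = pd.DataFrame({"col1": ["x"], "col2": [1]})
file_b = pd.DataFrame({"col1": ["x"], "col2": [1]})  # same row again

# "Write Once" style: concatenate first, then drop duplicates -> 1 row
print(len(pd.concat([file_a, file_b], ignore_index=True).drop_duplicates()))

# "Per File" style: drop duplicates per file, then concatenate -> 2 rows
print(len(pd.concat([df.drop_duplicates() for df in (file_a, file_b)],
                    ignore_index=True)))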
