
Faster method to extract information from complex nested json in python dataframe

I have some complex nested JSON data saved in a pandas DataFrame column and I am trying to flatten it into columns. The functions and methods I have tested are very slow and would take days to process all my data, so they are not really an option.

I am reproducing a simplified example of the data below, repeating the records to demonstrate how the number of records affects the processing time. Note: not all records have information for all the existing columns.

Please let me know of any other ways to improve the speed of that process. Thanks

import json
import numpy as np
import pandas as pd
str_json= {"json_col":[ '{"DayId":1,"Details":[{"LocationId":101,"Items":[{"Stock":[["A1",100],["A2",105],["A3",90],["A4",85]],"Product":"A"},{"Stock":[["B1",220],["B2",240]],"Product":"B"},{"Stock":[["C1",50]],"Product":"C"},{"Sold":[["A1",5],["A2",8],["A3",4]],"Product":"A"},{"Sold":[["C1",12]],"Product":"C"}]}]}','{"DayId":2,"Details":[{"LocationId":101,"Items":[{"Stock":[["D1",150],["D2",145],["D3",130]],"Product":"D"}]}]}','{"DayId":3,"Details":[{"LocationId":101,"Items":[{"Stock":[["A2",97],["A5",90]],"Product":"A"},{"Stock":[["E1",25],["E2",30],["E3",22],["E4",30]],"Product":"E"},{"Sold":[["B2",32]],"Product":"B"},{"Sold":[["D1",20],["D3",15]],"Product":"D"},{"Sold":[["E2",4],["E3",1],["E4",2]],"Product":"E"}]}]}']}


small_df = pd.DataFrame(str_json)

Example of small_df: (image)

Creating a relatively bigger dataframe to illustrate how processing speed is affected with more records:

bigger_df = pd.DataFrame(np.repeat(small_df.values, 1000, axis=0), columns=small_df.columns)

First function, step 1 (very slow). Extracting columns day_id, location_id and details_items:

def extract_details(row):
    # render the row back to a string, then parse it as JSON
    data = row.to_string(header=False, index=False)
    json_data = json.loads(data)
    result = pd.json_normalize(json_data)
    day_id = result.iloc[0].loc['DayId']
    details = result.iloc[0].loc['Details']

    # normalize the nested Details list to reach LocationId and Items
    details_df = pd.json_normalize(details)
    location_id = details_df.iloc[0].loc['LocationId']
    details_items = details_df.iloc[0].loc['Items']

    return day_id, location_id, details_items


%%time
step1_df = bigger_df.copy()
step1_df[['day_id', 'location_id', 'details_items']] = step1_df.apply(lambda row: extract_details(row), axis=1, result_type='expand')
step1_df.drop('json_col', axis=1, inplace=True)

Example of step 1 running time and output: (image)
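Much of the cost here likely comes from rendering each row back to text with to_string before parsing it again. A minimal sketch of a faster step 1 (extract_details_fast is a hypothetical helper; it assumes json_col always holds valid JSON with a single entry in Details, as in the sample data):

def extract_details_fast(js):
    # parse the JSON string directly, skipping the to_string round trip
    data = json.loads(js)
    detail = data['Details'][0]  # assumption: one Details entry per record
    return data['DayId'], detail['LocationId'], detail['Items']

step1_df = bigger_df.copy()
step1_df[['day_id', 'location_id', 'details_items']] = pd.DataFrame(
    step1_df['json_col'].map(extract_details_fast).tolist(), index=step1_df.index
)
step1_df.drop('json_col', axis=1, inplace=True)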

Second function, step 2 (relatively fast):

def flatten_nested_json_df(df):

    df = df.reset_index()

    print(f"original shape: {df.shape}")
    print(f"original columns: {df.columns}")

    # search for columns to explode/flatten
    s = (df.applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df.applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    print(f"lists: {list_columns}, dicts: {dict_columns}")
    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []

        for col in dict_columns:
            print(f"flattening: {col}")
            # explode dictionaries horizontally, adding new columns
            horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
            horiz_exploded.index = df.index
            df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
            new_columns.extend(horiz_exploded.columns)  # extends new_columns in place

        for col in list_columns:
            print(f"exploding: {col}")
            # explode lists vertically, adding new rows
            df = df.drop(columns=[col]).join(df[col].explode().to_frame())
            new_columns.append(col)

        # check if there are still dict or list fields to flatten
        s = (df[new_columns].applymap(type) == list).all()
        list_columns = s[s].index.tolist()

        s = (df[new_columns].applymap(type) == dict).all()
        dict_columns = s[s].index.tolist()

        print(f"lists: {list_columns}, dicts: {dict_columns}")

    print(f"final shape: {df.shape}")
    print(f"final columns: {df.columns}")
    return df
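A minimal usage sketch for step 2, assuming the step1_df produced by step 1 above:

%%time
step2_df = flatten_nested_json_df(step1_df)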

Example of step 2 running time and output: (image)
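For comparison, pd.json_normalize can also flatten this structure in a single call when given record_path and meta arguments, avoiding the row-wise apply entirely. A minimal sketch (one output row per item; the Stock/Sold columns still hold lists and would need a further explode):

parsed = [json.loads(js) for js in bigger_df['json_col']]
items_df = pd.json_normalize(
    parsed,
    record_path=['Details', 'Items'],           # one row per item dict
    meta=['DayId', ['Details', 'LocationId']],  # carry the outer keys down
)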

  • Here's a different approach: put in place a parser/transformer that generates a dict that can be flattened by json_normalize() in a single call.
  • The output structure is generic and fully flattened, hence column names include the index of each embedded list.
  • Timings are below; it's not clear to me what timings you are seeing.
  • There are opportunities to further optimise the transformer's output (emit less), but that would make it less generic.

create specialised JSON parser

from lark import Transformer
from lark import Lark

json_grammar = r"""
    ?value: dict
          | list
          | string
          | SIGNED_NUMBER      -> number
          | "true"             -> true
          | "false"            -> false
          | "null"             -> null

    list : "[" [value ("," value)*] "]"

    dict : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    string : ESCAPED_STRING

    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS

    """
class TreeToJson(Transformer):
    def string(self, s):
        (s,) = s
        return s[1:-1]  # strip the surrounding quotes

    def number(self, n):
        (n,) = n
        return float(n)

    # convert embedded lists to embedded dicts keyed by stringified index
    def list(self, l):
        return {str(i): v for i, v in enumerate(l)}

    pair = tuple
    dict = dict

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, start='value', parser='lalr', transformer=TreeToJson())
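For example, parsing a small snippet shows what the transformer produces: numbers come back as floats, and every embedded list becomes a dict keyed by its stringified index, which is exactly the shape json_normalize can flatten in one call:

json_parser.parse('{"Stock":[["A1",100]]}')
# -> {'Stock': {'0': {'0': 'A1', '1': 100.0}}}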

app code

import pandas as pd
import numpy as np
str_json= {"json_col":[ '{"DayId":1,"Details":[{"LocationId":101,"Items":[{"Stock":[["A1",100],["A2",105],["A3",90],["A4",85]],"Product":"A"},{"Stock":[["B1",220],["B2",240]],"Product":"B"},{"Stock":[["C1",50]],"Product":"C"},{"Sold":[["A1",5],["A2",8],["A3",4]],"Product":"A"},{"Sold":[["C1",12]],"Product":"C"}]}]}','{"DayId":2,"Details":[{"LocationId":101,"Items":[{"Stock":[["D1",150],["D2",145],["D3",130]],"Product":"D"}]}]}','{"DayId":3,"Details":[{"LocationId":101,"Items":[{"Stock":[["A2",97],["A5",90]],"Product":"A"},{"Stock":[["E1",25],["E2",30],["E3",22],["E4",30]],"Product":"E"},{"Sold":[["B2",32]],"Product":"B"},{"Sold":[["D1",20],["D3",15]],"Product":"D"},{"Sold":[["E2",4],["E3",1],["E4",2]],"Product":"E"}]}]}']}

small_df = pd.DataFrame(str_json)

def wide(js):
    # parse one JSON string and flatten the resulting dict into a single wide row
    df = pd.json_normalize(json_parser.parse(js))
    return pd.Series(df.values[0], index=df.columns)

bigger_df = pd.DataFrame(np.repeat(small_df.values, 1000, axis=0), columns=small_df.columns)
small_df["json_col"].apply(wide)

output

DayId Details.0.LocationId Details.0.Items.0.Stock.0.0 Details.0.Items.0.Stock.0.1 Details.0.Items.0.Stock.1.0 Details.0.Items.0.Stock.1.1 Details.0.Items.0.Stock.2.0 Details.0.Items.0.Stock.2.1 Details.0.Items.0.Stock.3.0 Details.0.Items.0.Stock.3.1 Details.0.Items.0.Product Details.0.Items.1.Stock.0.0 Details.0.Items.1.Stock.0.1 Details.0.Items.1.Stock.1.0 Details.0.Items.1.Stock.1.1 Details.0.Items.1.Product Details.0.Items.2.Stock.0.0 Details.0.Items.2.Stock.0.1 Details.0.Items.2.Product Details.0.Items.3.Sold.0.0 Details.0.Items.3.Sold.0.1 Details.0.Items.3.Sold.1.0 Details.0.Items.3.Sold.1.1 Details.0.Items.3.Sold.2.0 Details.0.Items.3.Sold.2.1 Details.0.Items.3.Product Details.0.Items.4.Sold.0.0 Details.0.Items.4.Sold.0.1 Details.0.Items.4.Product Details.0.Items.1.Stock.2.0 Details.0.Items.1.Stock.2.1 Details.0.Items.1.Stock.3.0 Details.0.Items.1.Stock.3.1 Details.0.Items.2.Sold.0.0 Details.0.Items.2.Sold.0.1 Details.0.Items.4.Sold.1.0 Details.0.Items.4.Sold.1.1 Details.0.Items.4.Sold.2.0 Details.0.Items.4.Sold.2.1
0 1 101 A1 100 A2 105 A3 90 A4 85 A B1 220 B2 240 B C1 50 C A1 5 A2 8 A3 4 A C1 12 C nan nan nan nan nan nan nan nan nan nan
1 2 101 D1 150 D2 145 D3 130 nan nan D nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
2 3 101 A2 97 A5 90 nan nan nan nan A E1 25 E2 30 E nan nan B D1 20 D3 15 nan nan D E2 4 E E3 22 E4 30 B2 32 E3 1 E4 2

timing

  • parsing and transforming: < 1 ms
  • parsing, transforming and creating the series: ~2 ms
  • parsing, transforming, creating the series and placing results in a data frame: ~3.5 ms
  • processing 3000 rows: ~5.5 s
%timeit json_parser.parse(small_df.values[0][0])
%timeit wide(small_df.values[0][0])
%timeit small_df.loc[0, ["json_col"]].apply(wide)
%timeit bigger_df["json_col"].apply(wide)
864 µs ± 8.22 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.99 ms ± 83.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.55 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.49 s ± 529 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
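For reference, the same index-keyed structure can also be produced with the standard library's C-accelerated json parser, which will likely outperform a pure-Python Lark parse; a minimal sketch (flatten_lists is a hypothetical helper, not part of the answer above):

def flatten_lists(obj):
    # recursively convert lists to dicts keyed by stringified index,
    # mirroring the Lark transformer's list rule
    if isinstance(obj, list):
        return {str(i): flatten_lists(v) for i, v in enumerate(obj)}
    if isinstance(obj, dict):
        return {k: flatten_lists(v) for k, v in obj.items()}
    return obj

def wide_std(js):
    df = pd.json_normalize(flatten_lists(json.loads(js)))
    return pd.Series(df.values[0], index=df.columns)

# %timeit bigger_df['json_col'].apply(wide_std)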
