
Faster method to extract information from complex nested json in python dataframe

I have some complex nested JSON data saved in a pandas DataFrame column and I am trying to flatten it into columns. The functions and methods I have tested are very slow and would take days to process all my data, so they are not really an option.

I am reproducing a simplified example of the data below, repeating the records to demonstrate how the number of records affects the processing time. Note: not all records have information for all the existing columns.

Please let me know of any other ways to improve the speed of that process. Thanks

import json
import numpy as np
import pandas as pd
str_json= {"json_col":[ '{"DayId":1,"Details":[{"LocationId":101,"Items":[{"Stock":[["A1",100],["A2",105],["A3",90],["A4",85]],"Product":"A"},{"Stock":[["B1",220],["B2",240]],"Product":"B"},{"Stock":[["C1",50]],"Product":"C"},{"Sold":[["A1",5],["A2",8],["A3",4]],"Product":"A"},{"Sold":[["C1",12]],"Product":"C"}]}]}','{"DayId":2,"Details":[{"LocationId":101,"Items":[{"Stock":[["D1",150],["D2",145],["D3",130]],"Product":"D"}]}]}','{"DayId":3,"Details":[{"LocationId":101,"Items":[{"Stock":[["A2",97],["A5",90]],"Product":"A"},{"Stock":[["E1",25],["E2",30],["E3",22],["E4",30]],"Product":"E"},{"Sold":[["B2",32]],"Product":"B"},{"Sold":[["D1",20],["D3",15]],"Product":"D"},{"Sold":[["E2",4],["E3",1],["E4",2]],"Product":"E"}]}]}']}


small_df = pd.DataFrame(str_json)

Example of small_df: (image)

Creating a relatively bigger dataframe to illustrate how processing speed is affected with more records:

bigger_df = pd.DataFrame(np.repeat(small_df.values, 1000, axis=0), columns=small_df.columns)

First function, step 1 (very slow). Extracting columns day_id, location_id and details_items:

def extract_details(row):
    # render the row back to a string, then parse it as JSON
    data = row.to_string(header=False, index=False)
    json_data = json.loads(data)
    result = pd.json_normalize(json_data)
    day_id = result.iloc[0].loc['DayId']
    details = result.iloc[0].loc['Details']

    # normalize the nested Details list to reach LocationId and Items
    details_df = pd.json_normalize(details)
    location_id = details_df.iloc[0].loc['LocationId']
    details_items = details_df.iloc[0].loc['Items']

    return day_id, location_id, details_items


%%time
step1_df = bigger_df.copy()
step1_df[['day_id', 'location_id', 'details_items']] = step1_df.apply(lambda row: extract_details(row), axis=1, result_type='expand')
step1_df.drop('json_col', axis=1, inplace=True)

Example of step 1 running time and output: (image)
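Much of the cost here likely comes from rendering each row back to text with to_string before parsing it again. A minimal sketch of a faster step 1 (extract_details_fast is a hypothetical helper; it assumes json_col always holds valid JSON with a single entry in Details, as in the sample data):

def extract_details_fast(js):
    # parse the JSON string directly, skipping the to_string round trip
    data = json.loads(js)
    detail = data['Details'][0]  # assumption: one Details entry per record
    return data['DayId'], detail['LocationId'], detail['Items']

step1_df = bigger_df.copy()
step1_df[['day_id', 'location_id', 'details_items']] = pd.DataFrame(
    step1_df['json_col'].map(extract_details_fast).tolist(), index=step1_df.index
)
step1_df.drop('json_col', axis=1, inplace=True)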

Second function, step 2 (relatively fast):

def flatten_nested_json_df(df):

    df = df.reset_index()

    print(f"original shape: {df.shape}")
    print(f"original columns: {df.columns}")

    # search for columns to explode/flatten
    s = (df.applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df.applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    print(f"lists: {list_columns}, dicts: {dict_columns}")
    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []

        for col in dict_columns:
            print(f"flattening: {col}")
            # explode dictionaries horizontally, adding new columns
            horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
            horiz_exploded.index = df.index
            df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
            new_columns.extend(horiz_exploded.columns)  # extends new_columns in place

        for col in list_columns:
            print(f"exploding: {col}")
            # explode lists vertically, adding new rows
            df = df.drop(columns=[col]).join(df[col].explode().to_frame())
            new_columns.append(col)

        # check if there are still dict or list fields to flatten
        s = (df[new_columns].applymap(type) == list).all()
        list_columns = s[s].index.tolist()

        s = (df[new_columns].applymap(type) == dict).all()
        dict_columns = s[s].index.tolist()

        print(f"lists: {list_columns}, dicts: {dict_columns}")

    print(f"final shape: {df.shape}")
    print(f"final columns: {df.columns}")
    return df
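A minimal usage sketch for step 2, assuming the step1_df produced by step 1 above:

%%time
step2_df = flatten_nested_json_df(step1_df)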

Example of step 2 running time and output: (image)
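For comparison, pd.json_normalize can also flatten this structure in a single call when given record_path and meta arguments, avoiding the row-wise apply entirely. A minimal sketch (one output row per item; the Stock/Sold columns still hold lists and would need a further explode):

parsed = [json.loads(js) for js in bigger_df['json_col']]
items_df = pd.json_normalize(
    parsed,
    record_path=['Details', 'Items'],           # one row per item dict
    meta=['DayId', ['Details', 'LocationId']],  # carry the outer keys down
)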

  • Here's a different approach: put in place a parser/transformer that generates a dict that can be flattened by json_normalize() in a single call.
  • The output structure is generic and fully flattened, hence column names include the index of each embedded list.
  • Timings are below; it's not clear to me what timings you are seeing.
  • There are opportunities to further optimise the transformer's output (emit less), but that would make it less generic.

create specialised JSON parser

from lark import Transformer
from lark import Lark

json_grammar = r"""
    ?value: dict
          | list
          | string
          | SIGNED_NUMBER      -> number
          | "true"             -> true
          | "false"            -> false
          | "null"             -> null

    list : "[" [value ("," value)*] "]"

    dict : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    string : ESCAPED_STRING

    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS

    """
class TreeToJson(Transformer):
    def string(self, s):
        (s,) = s
        return s[1:-1]  # strip the surrounding quotes

    def number(self, n):
        (n,) = n
        return float(n)

    # convert embedded lists to embedded dicts keyed by stringified index
    def list(self, l):
        return {str(i): v for i, v in enumerate(l)}

    pair = tuple
    dict = dict

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, start='value', parser='lalr', transformer=TreeToJson())
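For example, parsing a small snippet shows what the transformer produces: numbers come back as floats, and every embedded list becomes a dict keyed by its stringified index, which is exactly the shape json_normalize can flatten in one call:

json_parser.parse('{"Stock":[["A1",100]]}')
# -> {'Stock': {'0': {'0': 'A1', '1': 100.0}}}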

app code

import pandas as pd
import numpy as np
str_json= {"json_col":[ '{"DayId":1,"Details":[{"LocationId":101,"Items":[{"Stock":[["A1",100],["A2",105],["A3",90],["A4",85]],"Product":"A"},{"Stock":[["B1",220],["B2",240]],"Product":"B"},{"Stock":[["C1",50]],"Product":"C"},{"Sold":[["A1",5],["A2",8],["A3",4]],"Product":"A"},{"Sold":[["C1",12]],"Product":"C"}]}]}','{"DayId":2,"Details":[{"LocationId":101,"Items":[{"Stock":[["D1",150],["D2",145],["D3",130]],"Product":"D"}]}]}','{"DayId":3,"Details":[{"LocationId":101,"Items":[{"Stock":[["A2",97],["A5",90]],"Product":"A"},{"Stock":[["E1",25],["E2",30],["E3",22],["E4",30]],"Product":"E"},{"Sold":[["B2",32]],"Product":"B"},{"Sold":[["D1",20],["D3",15]],"Product":"D"},{"Sold":[["E2",4],["E3",1],["E4",2]],"Product":"E"}]}]}']}

small_df = pd.DataFrame(str_json)

def wide(js):
    # parse one JSON string and flatten the resulting dict into a single wide row
    df = pd.json_normalize(json_parser.parse(js))
    return pd.Series(df.values[0], index=df.columns)

bigger_df = pd.DataFrame(np.repeat(small_df.values, 1000, axis=0), columns=small_df.columns)
small_df["json_col"].apply(wide)

output

DayId Details.0.LocationId Details.0.Items.0.Stock.0.0 Details.0.Items.0.Stock.0.1 Details.0.Items.0.Stock.1.0 Details.0.Items.0.Stock.1.1 Details.0.Items.0.Stock.2.0 Details.0.Items.0.Stock.2.1 Details.0.Items.0.Stock.3.0 Details.0.Items.0.Stock.3.1 Details.0.Items.0.Product Details.0.Items.1.Stock.0.0 Details.0.Items.1.Stock.0.1 Details.0.Items.1.Stock.1.0 Details.0.Items.1.Stock.1.1 Details.0.Items.1.Product Details.0.Items.2.Stock.0.0 Details.0.Items.2.Stock.0.1 Details.0.Items.2.Product Details.0.Items.3.Sold.0.0 Details.0.Items.3.Sold.0.1 Details.0.Items.3.Sold.1.0 Details.0.Items.3.Sold.1.1 Details.0.Items.3.Sold.2.0 Details.0.Items.3.Sold.2.1 Details.0.Items.3.Product Details.0.Items.4.Sold.0.0 Details.0.Items.4.Sold.0.1 Details.0.Items.4.Product Details.0.Items.1.Stock.2.0 Details.0.Items.1.Stock.2.1 Details.0.Items.1.Stock.3.0 Details.0.Items.1.Stock.3.1 Details.0.Items.2.Sold.0.0 Details.0.Items.2.Sold.0.1 Details.0.Items.4.Sold.1.0 Details.0.Items.4.Sold.1.1 Details.0.Items.4.Sold.2.0 Details.0.Items.4.Sold.2.1
0 1 101 A1 100 A2 105 A3 90 A4 85 A B1 220 B2 240 B C1 50 C A1 5 A2 8 A3 4 A C1 12 C nan nan nan nan nan nan nan nan nan nan
1 2 101 D1 150 D2 145 D3 130 nan nan D nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
2 3 101 A2 97 A5 90 nan nan nan nan A E1 25 E2 30 E nan nan B D1 20 D3 15 nan nan D E2 4 E E3 22 E4 30 B2 32 E3 1 E4 2

timing

  • parsing and transforming: < 1 ms
  • parsing, transforming and creating the series: ~2 ms
  • parsing, transforming, creating the series and placing results in a data frame: ~3.5 ms
  • processing 3000 rows: ~5.5 s
%timeit json_parser.parse(small_df.values[0][0])
%timeit wide(small_df.values[0][0])
%timeit small_df.loc[0, ["json_col"]].apply(wide)
%timeit bigger_df["json_col"].apply(wide)
864 µs ± 8.22 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.99 ms ± 83.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.55 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.49 s ± 529 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
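For reference, the same index-keyed structure can also be produced with the standard library's C-accelerated json parser, which will likely outperform a pure-Python Lark parse; a minimal sketch (flatten_lists is a hypothetical helper, not part of the answer above):

def flatten_lists(obj):
    # recursively convert lists to dicts keyed by stringified index,
    # mirroring the Lark transformer's list rule
    if isinstance(obj, list):
        return {str(i): flatten_lists(v) for i, v in enumerate(obj)}
    if isinstance(obj, dict):
        return {k: flatten_lists(v) for k, v in obj.items()}
    return obj

def wide_std(js):
    df = pd.json_normalize(flatten_lists(json.loads(js)))
    return pd.Series(df.values[0], index=df.columns)

# %timeit bigger_df['json_col'].apply(wide_std)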
