Flatten Complex Nested JSON (PYSPARK)

Question

I am trying to flatten a nested JSON document that has the following schema:

 root
 |-- id: string (nullable = true)
 |-- InsuranceProvider: string (nullable = true)
 |-- Type: struct (nullable = true)
 |    |-- Client: struct (nullable = true)
 |    |    |-- PaidIn: struct (nullable = true)
 |    |    |    |-- Insuranceid: string (nullable = true)
 |    |    |    |-- Insurancedesc: string (nullable = true)
 |    |    |    |-- purchaseditems: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- InsuranceNumber: string (nullable = true)
 |    |    |    |    |    |-- InsuranceLabel: string (nullable = true)
 |    |    |    |    |    |-- Insurancequantity: double (nullable = true)
 |    |    |    |    |    |-- Insuranceprice: integer (nullable = true)
 |    |    |    |    |    |-- discountsreceived: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- amount: integer (nullable = true)
 |    |    |    |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |    |-- childItems: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- InsuranceNumber: string (nullable = true)
 |    |    |    |    |    |    |    |-- InsuranceLabel: string (nullable = true)
 |    |    |    |    |    |    |    |-- Insurancequantity: double (nullable = true)
 |    |    |    |    |    |    |    |-- Insuranceprice: integer (nullable = true)
 |    |    |    |    |    |    |    |-- discountsreceived: array (nullable = true)
 |    |    |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |    |    |-- amount: integer (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- description: string (nullable = true)
 |-- eventTime: string (nullable = true)

I am trying to manually flatten it by exploding each array type and selecting the column required.

Below is my code:

# Step 1: keep the top-level scalar columns and produce one row per element of
# Type.Client.PaidIn.purchaseditems. InsuranceProvider is commented out, so it
# is not part of the result. Note that explode() (unlike explode_outer())
# drops input rows whose array is null or empty.
df1 = purchaseDF.select(#col("InsuranceProvider"),
          col("eventTime"),
          col("id"),
          col("Type.Client.PaidIn.Insurancedesc").alias("Insurancedesc"),
          col("Type.Client.PaidIn.Insuranceid").alias("Insuranceid"),
          explode("Type.Client.PaidIn.purchaseditems").alias("purchaseditemsArray")
          )#.show(truncate=True)

# Step 2: pull the scalar fields out of each purchased item and produce one row
# per child item. purchaseditemsArray is intentionally kept (the drop is
# commented out) because its discountsreceived array is exploded further below.
df2 = df1.select("*",
    col("purchaseditemsArray.InsuranceLabel").alias("InsuranceLabel"),
    col("purchaseditemsArray.InsuranceNumber").alias("InsuranceNumber"),
    col("purchaseditemsArray.Insuranceprice").alias("Insuranceprice"),
    col("purchaseditemsArray.Insurancequantity").alias("Insurancequantity"),
    explode("purchaseditemsArray.childItems").alias("childItemsArray")
)#.drop("purchaseditemsArray")
#.show(truncate=False)

# Step 3: pull the scalar fields out of each child item (prefixed with "Child"
# to avoid clashing with the item-level names) and produce one row per
# child-level discount.
df3 = df2.select("*",
    col("childItemsArray.InsuranceLabel").alias("ChildInsuranceLabel"),
    col("childItemsArray.InsuranceNumber").alias("ChildInsuranceNumber"),
    col("childItemsArray.Insuranceprice").alias("ChildInsuranceprice"),
    col("childItemsArray.Insurancequantity").alias("ChildInsurancequantity"),
    explode("childItemsArray.discountsreceived").alias("discountsreceivedArray")
).drop("childItemsArray")
#.show(truncate=False)

# Step 4: turn the child-level discount struct into two scalar columns.
df4 = df3.select("*",
    col("discountsreceivedArray.amount").alias("ChildItemDiscountAmount"),
    col("discountsreceivedArray.description").alias("ChildItemDiscountDescription")
).drop("discountsreceivedArray")
#.show(truncate=False)

# Step 5: explode the item-level discounts. This runs on rows that were already
# multiplied by child items and child discounts, so every such row is repeated
# once per item-level discount (a cross product of the sibling arrays).
discountReceivedDF = df4.select("*",
    explode("purchaseditemsArray.discountsreceived").alias("discountsreceivedArray")
)

# Step 6: expose the item-level discount fields. They are selected without an
# alias, so they appear as plain "amount" and "description"; the remaining
# helper struct columns are dropped.
finalDF = discountReceivedDF.select("*",
                         "discountsreceivedArray.amount",
                         "discountsreceivedArray.description"
                         ).drop("purchaseditemsArray").drop("discountsreceivedArray")

With the code above, every nested field ends up in its own column. I used the following to display the result and its schema:

finalDF.show(truncate= False)
finalDF.printSchema()

OUTPUT:

root
 |-- eventTime: string (nullable = true)
 |-- id: string (nullable = true)
 |-- Insurancedesc: string (nullable = true)
 |-- Insuranceid: string (nullable = true)
 |-- InsuranceLabel: string (nullable = true)
 |-- InsuranceNumber: string (nullable = true)
 |-- Insuranceprice: integer (nullable = true)
 |-- Insurancequantity: double (nullable = true)
 |-- ChildInsuranceLabel: string (nullable = true)
 |-- ChildInsuranceNumber: string (nullable = true)
 |-- ChildInsuranceprice: integer (nullable = true)
 |-- ChildInsurancequantity: double (nullable = true)
 |-- ChildItemDiscountAmount: integer (nullable = true)
 |-- ChildItemDiscountDescription: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- description: string (nullable = true)

My question is: is there a simpler or more efficient way to flatten this nested JSON, ideally without hand-writing a select/explode step for every level? I have read many blog posts on the topic but could not work out how to apply them to my case.

Below is the sample JSON Data:

[
    {
      "id": "164651478631223455788978942317",
      "InsuranceProvider": "Embroker",
      "Type": {
        "Client": {
          "PaidIn": {
            "Insuranceid": "97331549875122744335422",
            "Insurancedesc": "Magic happens here",
            "purchaseditems": [
              {
                "InsuranceNumber": "1",
                "InsuranceLabel": "DNO",
                "Insurancequantity": 1,
                "Insuranceprice": 345,
                "discountsreceived": [
                  {
                    "amount": 495,
                    "description": "Item 1, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 2495,
                        "description": "Insurance item 1, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "2",
                "InsuranceLabel": "LPL",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": -295,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 495,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "3",
                "InsuranceLabel": "LPL",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 295,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 400,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "4",
                "InsuranceLabel": "LPL",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 295,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 335,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "5",
                "InsuranceLabel": "Employment Practices Liability",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 1295,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 195,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "6",
                "InsuranceLabel": "Employment Practices Liability",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 805,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 501,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "7",
                "InsuranceLabel": "Employment Practices Liability",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 521,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 533,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "8",
                "InsuranceLabel": "Employment Practices Liability",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 422,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 333,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "9",
                "InsuranceLabel": "Employment Practices Liability",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 444,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 666,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              },
              {
                "InsuranceNumber": "10",
                "InsuranceLabel": "DNO",
                "Insurancequantity": 2,
                "Insuranceprice": 945,
                "discountsreceived": [
                  {
                    "amount": 10,
                    "description": "Item 2, Discount 1"
                  }
                ],
                "childItems": [
                  {
                    "InsuranceNumber": "1",
                    "InsuranceLabel": "Cyber",
                    "Insurancequantity": 1,
                    "Insuranceprice": 0,
                    "discountsreceived": [
                      {
                        "amount": 63,
                        "description": "Insurance item 2, Discount 1"
                      }
                    ]
                  }
                ]
              }
            ]
          }
        }
      },
      "eventTime": "2020-05-19T01:59:10.379Z"
    }
  ]

Thank you so much in advance:)

Answer 1

# Load the JSON file into a DataFrame. multiLine=True lets Spark parse a JSON
# document that spans several lines (such as a pretty-printed array) instead of
# expecting one JSON object per line.
# NOTE(review): "test.json" is presumably the sample data from the question.
df = spark.read.json("test.json", multiLine=True)



from pyspark.sql.functions import *
from pyspark.sql.types import *


def flatten_test(df, sep="_"):
    """Return a copy of ``df`` in which no column is a struct, array or map.

    Complex columns are processed one at a time, always taking the first one
    in schema order, until none remain:

    * struct -> one column per field, named ``<parent><sep><field>``
    * array  -> one row per element (``explode_outer``, so rows whose array is
      null or empty are kept with a null value)
    * map    -> one column per distinct non-null key found in the data, named
      ``<parent><sep><key>``, in sorted key order

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data frame to flatten. It is not modified; a new data frame is
        returned.
    sep : str
        Delimiter placed between parent and child names in the flattened
        column names. Default ``_``.

    Returns
    -------
    pyspark.sql.DataFrame
        Data frame containing only non-complex columns.

    Notes
    -----
    Avoid ``.`` as ``sep``. The generated column names would contain dots, and
    every later reference to them would have to be backtick-quoted, because
    Spark otherwise reads a dot as struct-field access.

    Sibling arrays are exploded one after another, so a row holding two arrays
    of lengths ``m`` and ``n`` becomes ``m * n`` rows (their cross product).

    Flattening a map column has to collect every distinct key of that column
    to the driver. This triggers a Spark job per map column and can be slow.

    Examples
    --------

    data_mixed = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
                {"name": "Palm Beach", "population": 60000},
            ],
        },
        {
            "state": "Ohio",
            "shortname": "OH",
            "info": {"governor": "John Kasich"},
            "counties": [
                {"name": "Summit", "population": 1234},
                {"name": "Cuyahoga", "population": 1337},
            ],
        },
    ]

    data_mixed = spark.createDataFrame(data=data_mixed)

    data_mixed.printSchema()

    root
    |-- counties: array (nullable = true)
    |    |-- element: map (containsNull = true)
    |    |    |-- key: string
    |    |    |-- value: string (valueContainsNull = true)
    |-- info: map (nullable = true)
    |    |-- key: string
    |    |-- value: string (valueContainsNull = true)
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)


    data_mixed_flat = flatten_test(data_mixed, sep=":")
    data_mixed_flat.printSchema()
    root
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- counties:name: string (nullable = true)
    |-- counties:population: string (nullable = true)
    |-- info:governor: string (nullable = true)


    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]


    df = spark.createDataFrame(data=data)

    df.printSchema()

    root
    |-- fitness: map (nullable = true)
    |    |-- key: string
    |    |-- value: long (valueContainsNull = true)
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)

    df_flat = flatten_test(df, sep=":")

    df_flat.printSchema()

    root
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)
    |-- fitness:height: long (nullable = true)
    |-- fitness:weight: long (nullable = true)

    data_struct = [
            (("James",None,"Smith"),"OH","M"),
            (("Anna","Rose",""),"NY","F"),
            (("Julia","","Williams"),"OH","F"),
            (("Maria","Anne","Jones"),"NY","M"),
            (("Jen","Mary","Brown"),"NY","M"),
            (("Mike","Mary","Williams"),"OH","M")
            ]


    schema = StructType([
        StructField('name', StructType([
            StructField('firstname', StringType(), True),
            StructField('middlename', StringType(), True),
            StructField('lastname', StringType(), True)
            ])),
        StructField('state', StringType(), True),
        StructField('gender', StringType(), True)
        ])

    df_struct = spark.createDataFrame(data = data_struct, schema = schema)

    df_struct.printSchema()

    root
    |-- name: struct (nullable = true)
    |    |-- firstname: string (nullable = true)
    |    |-- middlename: string (nullable = true)
    |    |-- lastname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)

    df_struct_flat = flatten_test(df_struct, sep=":")

    df_struct_flat.printSchema()

    root
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)
    |-- name:firstname: string (nullable = true)
    |-- name:middlename: string (nullable = true)
    |-- name:lastname: string (nullable = true)
    """
    complex_fields = _complex_fields(df)

    while complex_fields:
        # Handle the first remaining complex column; the schema is rescanned
        # after every step because each step can expose new complex columns.
        col_name, data_type = next(iter(complex_fields.items()))
        # Backtick-quote the name so a column whose name itself contains a dot
        # is not misread as a nested-field path.
        quoted = _quote_name(col_name)

        if isinstance(data_type, StructType):
            # Flatten the struct: promote every field to a top-level column.
            expanded = [
                col(quoted + "." + _quote_name(child.name)).alias(
                    col_name + sep + child.name
                )
                for child in data_type.fields
            ]
            df = df.select("*", *expanded).drop(col_name)

        elif isinstance(data_type, ArrayType):
            # Explode the array: one output row per element. The column keeps
            # its name and now holds the element type.
            df = df.withColumn(col_name, explode_outer(col(quoted)))

        else:
            # MapType: promote every distinct key to a top-level column.
            key_rows = (
                df.select(explode_outer(map_keys(col(quoted)))).distinct().collect()
            )
            # explode_outer yields a null key for rows whose map is null or
            # empty; skip it, otherwise building the column name would fail.
            # Sorting makes the resulting column order deterministic.
            keys = sorted(
                (row[0] for row in key_rows if row[0] is not None), key=str
            )
            key_cols = [
                col(quoted).getItem(key).alias(col_name + sep + str(key))
                for key in keys
            ]
            kept_cols = [
                col(_quote_name(name)) for name in df.columns if name != col_name
            ]
            df = df.select(kept_cols + key_cols)

        complex_fields = _complex_fields(df)

    return df


def _complex_fields(df):
    """Map the name of every struct, array or map column of ``df`` to its data type."""
    return {
        field.name: field.dataType
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType, MapType))
    }


def _quote_name(name):
    """Backtick-quote a single column or field name for use inside ``col()``."""
    # A literal backtick inside a quoted identifier is escaped by doubling it.
    return "`" + name.replace("`", "``") + "`"



# Flatten the loaded data frame using the default "_" separator and print the
# schema of the result.
df_flat = flatten_test(df)
df_flat.printSchema()

root
 |-- InsuranceProvider: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- id: string (nullable = true)
 |-- Type_Client_PaidIn_Insurancedesc: string (nullable = true)
 |-- Type_Client_PaidIn_Insuranceid: string (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_InsuranceLabel: string (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_InsuranceNumber: string (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_Insuranceprice: long (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_Insurancequantity: long (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_childItems_InsuranceLabel: string (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_childItems_InsuranceNumber: string (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_childItems_Insuranceprice: long (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_childItems_Insurancequantity: long (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_discountsreceived_amount: long (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_discountsreceived_description: string (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_childItems_discountsreceived_amount: long (nullable = true)
 |-- Type_Client_PaidIn_purchaseditems_childItems_discountsreceived_description: string (nullable = true)

Flatten Complex Nested JSON (PYSPARK)

Question

1 answers

solution1
0 2022-09-09 18:42:14

Flatten Complex Nested JSON (PYSPARK)

Question

1 answers

solution1 0 2022-09-09 18:42:14

solution1
0 2022-09-09 18:42:14