I am trying to flatten the nested JSON below (schema shown first):
root
|-- id: string (nullable = true)
|-- InsuranceProvider: string (nullable = true)
|-- Type: struct (nullable = true)
| |-- Client: struct (nullable = true)
| | |-- PaidIn: struct (nullable = true)
| | | |-- Insuranceid: string (nullable = true)
| | | |-- Insurancedesc: string (nullable = true)
| | | |-- purchaseditems: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- InsuranceNumber: string (nullable = true)
| | | | | |-- InsuranceLabel: string (nullable = true)
| | | | | |-- Insurancequantity: double (nullable = true)
| | | | | |-- Insuranceprice: integer (nullable = true)
| | | | | |-- discountsreceived: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- amount: integer (nullable = true)
| | | | | | | |-- description: string (nullable = true)
| | | | | |-- childItems: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- InsuranceNumber: string (nullable = true)
| | | | | | | |-- InsuranceLabel: string (nullable = true)
| | | | | | | |-- Insurancequantity: double (nullable = true)
| | | | | | | |-- Insuranceprice: integer (nullable = true)
| | | | | | | |-- discountsreceived: array (nullable = true)
| | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | |-- amount: integer (nullable = true)
| | | | | | | | | |-- description: string (nullable = true)
|-- eventTime: string (nullable = true)
I am trying to flatten it manually by exploding each array-typed column and then selecting the required columns.
Below is my code:
# Step 1: keep the top-level scalars and emit one row per purchased item.
# InsuranceProvider is deliberately not selected here.
df1 = purchaseDF.select(
    col("eventTime"),
    col("id"),
    *[
        col("Type.Client.PaidIn." + field).alias(field)
        for field in ("Insurancedesc", "Insuranceid")
    ],
    explode("Type.Client.PaidIn.purchaseditems").alias("purchaseditemsArray"),
)

# Step 2: lift the scalar fields of each purchased item and emit one row per
# child item. purchaseditemsArray is kept because its discounts are exploded
# further down.
df2 = df1.select(
    "*",
    *[
        col("purchaseditemsArray." + field).alias(field)
        for field in (
            "InsuranceLabel",
            "InsuranceNumber",
            "Insuranceprice",
            "Insurancequantity",
        )
    ],
    explode("purchaseditemsArray.childItems").alias("childItemsArray"),
)

# Step 3: lift the child item's scalar fields (prefixed with "Child") and emit
# one row per child-item discount.
df3 = df2.select(
    "*",
    *[
        col("childItemsArray." + field).alias("Child" + field)
        for field in (
            "InsuranceLabel",
            "InsuranceNumber",
            "Insuranceprice",
            "Insurancequantity",
        )
    ],
    explode("childItemsArray.discountsreceived").alias("discountsreceivedArray"),
).drop("childItemsArray")

# Step 4: lift the two fields of the child-item discount struct.
df4 = df3.select(
    "*",
    col("discountsreceivedArray.amount").alias("ChildItemDiscountAmount"),
    col("discountsreceivedArray.description").alias("ChildItemDiscountDescription"),
).drop("discountsreceivedArray")

# Step 5: emit one row per discount attached to the purchased item itself.
discountReceivedDF = df4.select(
    "*",
    explode("purchaseditemsArray.discountsreceived").alias("discountsreceivedArray"),
)

# Step 6: lift the item-level discount fields (they keep their original names,
# "amount" and "description") and discard the helper struct columns.
finalDF = discountReceivedDF.select(
    "*",
    "discountsreceivedArray.amount",
    "discountsreceivedArray.description",
).drop("purchaseditemsArray", "discountsreceivedArray")
After writing the above I got all the columns separately. I tried the below to show the result:
# Print the flattened rows without truncating long cell values.
finalDF.show(truncate= False)
# Print the column names and types of the flattened frame.
finalDF.printSchema()
OUTPUT:
root
|-- eventTime: string (nullable = true)
|-- id: string (nullable = true)
|-- Insurancedesc: string (nullable = true)
|-- Insuranceid: string (nullable = true)
|-- InsuranceLabel: string (nullable = true)
|-- InsuranceNumber: string (nullable = true)
|-- Insuranceprice: integer (nullable = true)
|-- Insurancequantity: double (nullable = true)
|-- ChildInsuranceLabel: string (nullable = true)
|-- ChildInsuranceNumber: string (nullable = true)
|-- ChildInsuranceprice: integer (nullable = true)
|-- ChildInsurancequantity: double (nullable = true)
|-- ChildItemDiscountAmount: integer (nullable = true)
|-- ChildItemDiscountDescription: string (nullable = true)
|-- amount: integer (nullable = true)
|-- description: string (nullable = true)
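My question is: is there a more generic or more efficient way to flatten this nested JSON, so that I do not have to explode and select every level by hand? I have read many blog posts on the subject but could not work out how to apply them to this structure.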
Below is the sample JSON Data:
[
{
"id": "164651478631223455788978942317",
"InsuranceProvider": "Embroker",
"Type": {
"Client": {
"PaidIn": {
"Insuranceid": "97331549875122744335422",
"Insurancedesc": "Magic happens here",
"purchaseditems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "DNO",
"Insurancequantity": 1,
"Insuranceprice": 345,
"discountsreceived": [
{
"amount": 495,
"description": "Item 1, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 2495,
"description": "Insurance item 1, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "2",
"InsuranceLabel": "LPL",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": -295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 495,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "3",
"InsuranceLabel": "LPL",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 400,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "4",
"InsuranceLabel": "LPL",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 335,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "5",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 1295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 195,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "6",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 805,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 501,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "7",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 521,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 533,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "8",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 422,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 333,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "9",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 444,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 666,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "10",
"InsuranceLabel": "DNO",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 10,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 63,
"description": "Insurance item 2, Discount 1"
}
]
}
]
}
]
}
}
},
"eventTime": "2020-05-19T01:59:10.379Z"
}
]
Thank you so much in advance:)
# Load the sample document; multiLine=True is passed because the JSON in
# test.json spans several lines instead of holding one record per line.
df = spark.read.json("test.json", multiLine=True)
from pyspark.sql.functions import *
from pyspark.sql.types import *
def _first_complex_field(df):
    """Return ``(name, dataType)`` of the first array, struct or map column
    in ``df``'s schema, or ``None`` when every column is already flat."""
    for field in df.schema.fields:
        if isinstance(field.dataType, (ArrayType, StructType, MapType)):
            return field.name, field.dataType
    return None


def flatten_test(df, sep="_"):
    """Returns a flattened dataframe.

    Repeatedly takes the first complex column in the schema and removes one
    level of nesting from it, until no array, struct or map column is left:

    * struct -> one column per field, named ``<parent><sep><field>``
    * array  -> one row per element (``explode_outer``, so rows whose array
      is null or empty are kept)
    * map    -> one column per distinct key found in the data, named
      ``<parent><sep><key>``

    .. versionadded:: x.X.X

    Parameters
    ----------
    df : DataFrame
        The dataframe to flatten.
    sep : str
        Delimiter for flattened column names. Default `_`

    Returns
    -------
    DataFrame
        A dataframe whose schema contains no array, struct or map columns.

    Notes
    -----
    Don't use `.` as `sep`.
    Spark reads a dot in a column reference as nested-field access, so
    flattening will not work on data frames nested more than one level deep,
    and the resulting columns would have to be referenced with backtick
    quoting.

    Flattening map types has to find every key in the column, which runs a
    Spark job per map column. This can be slow.

    Exploding several sibling arrays multiplies rows (one row per combination
    of elements).

    Map columns are expanded in sorted key order so that the output schema is
    the same on every run. Rows whose map is null contribute no key.

    Examples
    --------
    data_mixed = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
                {"name": "Palm Beach", "population": 60000},
            ],
        },
        {
            "state": "Ohio",
            "shortname": "OH",
            "info": {"governor": "John Kasich"},
            "counties": [
                {"name": "Summit", "population": 1234},
                {"name": "Cuyahoga", "population": 1337},
            ],
        },
    ]

    data_mixed = spark.createDataFrame(data=data_mixed)
    data_mixed.printSchema()

    root
    |-- counties: array (nullable = true)
    |    |-- element: map (containsNull = true)
    |    |    |-- key: string
    |    |    |-- value: string (valueContainsNull = true)
    |-- info: map (nullable = true)
    |    |-- key: string
    |    |-- value: string (valueContainsNull = true)
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)

    data_mixed_flat = flatten_test(data_mixed, sep=":")
    data_mixed_flat.printSchema()

    root
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- counties:name: string (nullable = true)
    |-- counties:population: string (nullable = true)
    |-- info:governor: string (nullable = true)

    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]

    df = spark.createDataFrame(data=data)
    df.printSchema()

    root
    |-- fitness: map (nullable = true)
    |    |-- key: string
    |    |-- value: long (valueContainsNull = true)
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)

    df_flat = flatten_test(df, sep=":")
    df_flat.printSchema()

    root
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)
    |-- fitness:height: long (nullable = true)
    |-- fitness:weight: long (nullable = true)

    data_struct = [
        (("James",None,"Smith"),"OH","M"),
        (("Anna","Rose",""),"NY","F"),
        (("Julia","","Williams"),"OH","F"),
        (("Maria","Anne","Jones"),"NY","M"),
        (("Jen","Mary","Brown"),"NY","M"),
        (("Mike","Mary","Williams"),"OH","M")
    ]

    schema = StructType([
        StructField('name', StructType([
            StructField('firstname', StringType(), True),
            StructField('middlename', StringType(), True),
            StructField('lastname', StringType(), True)
        ])),
        StructField('state', StringType(), True),
        StructField('gender', StringType(), True)
    ])

    df_struct = spark.createDataFrame(data = data_struct, schema = schema)
    df_struct.printSchema()

    root
    |-- name: struct (nullable = true)
    |    |-- firstname: string (nullable = true)
    |    |-- middlename: string (nullable = true)
    |    |-- lastname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)

    df_struct_flat = flatten_test(df_struct, sep=":")
    df_struct_flat.printSchema()

    root
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)
    |-- name:firstname: string (nullable = true)
    |-- name:middlename: string (nullable = true)
    |-- name:lastname: string (nullable = true)
    """
    pending = _first_complex_field(df)
    while pending is not None:
        col_name, data_type = pending

        if isinstance(data_type, StructType):
            # Flatten the struct: promote every field to a top-level column.
            expanded = [
                col(col_name + "." + field.name).alias(col_name + sep + field.name)
                for field in data_type.fields
            ]
            df = df.select("*", *expanded).drop(col_name)

        elif isinstance(data_type, ArrayType):
            # Explode the array: one row per element. The element type is
            # handled by a later iteration if it is itself complex.
            df = df.withColumn(col_name, explode_outer(col_name))

        else:
            # MapType: collect the distinct keys, then promote each key to a
            # column. Plain explode (not explode_outer) is used so that rows
            # with a null map do not yield a null key, which cannot be turned
            # into a column name.
            keys_df = df.select(explode(map_keys(col(col_name)))).distinct()
            # Sort by string form: deterministic column order for any key type.
            keys = sorted((row[0] for row in keys_df.collect()), key=str)
            key_cols = [
                # str(key) so that non-string map keys still form a valid name.
                col(col_name).getItem(key).alias(col_name + sep + str(key))
                for key in keys
            ]
            kept = [name for name in df.columns if name != col_name]
            df = df.select(kept + key_cols)

        # The schema changed; look for the next complex column.
        pending = _first_complex_field(df)

    return df
# Flatten the frame loaded from test.json using the default "_" separator,
# then print the resulting schema (shown below).
df_flat = flatten_test(df)
df_flat.printSchema()
root
|-- InsuranceProvider: string (nullable = true)
|-- eventTime: string (nullable = true)
|-- id: string (nullable = true)
|-- Type_Client_PaidIn_Insurancedesc: string (nullable = true)
|-- Type_Client_PaidIn_Insuranceid: string (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_InsuranceLabel: string (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_InsuranceNumber: string (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_Insuranceprice: long (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_Insurancequantity: long (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_childItems_InsuranceLabel: string (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_childItems_InsuranceNumber: string (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_childItems_Insuranceprice: long (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_childItems_Insurancequantity: long (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_discountsreceived_amount: long (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_discountsreceived_description: string (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_childItems_discountsreceived_amount: long (nullable = true)
|-- Type_Client_PaidIn_purchaseditems_childItems_discountsreceived_description: string (nullable = true)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.