[英]Pyspark modify values of JSON fields without changing schema
我需要使用 pyspark 修改嵌套 JSON 的值並保持架構不變,架構應與原始 JSON 相同,只需修改少數字段的值。
下面是我的樣品 json:
我想修改以下字段的值:references 中每個元素的 TAG1、TAG2,以及 account 數組各元素和 holder 結構中的 ADDR1、ADDR2(其餘字段保持不變):
源 JSON:
{
"references": [
{
"TAG1": VALUE1,
"TAG2": "VALUE2",
"TAG3": VALUE3,
"TAG4": "VALUE4",
"account": [
{
"ID": A_VALUE1,
"BANK_ID": A_VALUE2,
"ADDR1": "A_VALUE3",
"ADDR2": "A_VALUE4"
}
],
"holder": {
"ID": H_VALUE1,
"BANK_ID": H_VALUE2,
"ADDR1": "H_VALUE3",
"ADDR2": "H_VALUE4"
}
},
{
"TAG1": VALUE1,
"TAG2": "VALUE2",
"TAG3": VALUE3,
"TAG4": "VALUE4",
"account": [
{
"ID": A_VALUE1,
"BANK_ID": A_VALUE2,
"ADDR1": "A_VALUE3",
"ADDR2": "A_VALUE4"
}
],
"holder": {
"ID": H_VALUE1,
"BANK_ID": H_VALUE2,
"ADDR1": "H_VALUE3",
"ADDR2": "H_VALUE4"
}
}
]
}
OUTPUT JSON:
{
"references": [
{
"TAG1": NEW_VALUE1,
"TAG2": "NEW_VALUE2",
"TAG3": VALUE3,
"TAG4": "VALUE4",
"account": [
{
"ID": A_VALUE1,
"BANK_ID": A_VALUE2,
"ADDR1": "NEW_ADDR1",
"ADDR2": "NEW_ADDR2"
}
],
"holder": {
"ID": H_VALUE1,
"BANK_ID": H_VALUE2,
"ADDR1": "NEW_ADDR1",
"ADDR2": "NEW_ADDR2"
}
},
{
"TAG1": NEW_VALUE1,
"TAG2": "NEW_VALUE2",
"TAG3": VALUE3,
"TAG4": "VALUE4",
"account": [
{
"ID": A_VALUE1,
"BANK_ID": A_VALUE2,
"ADDR1": "NEW_ADDR1",
"ADDR2": "NEW_ADDR2"
}
],
"holder": {
"ID": H_VALUE1,
"BANK_ID": H_VALUE2,
"ADDR1": "NEW_ADDR1",
"ADDR2": "NEW_ADDR2"
}
}
]
}
您可以使用 Spark SQL 的 transform 函數來更新 references 數組列中的結構元素:
# Rewrite selected fields inside the nested `references` array column while
# keeping the same field names and nesting.
#
# NOTE(review): `df` is not defined in this snippet; it is assumed to be a
# DataFrame already loaded from the source JSON with a `references` column
# of type array<struct<...>> -- confirm against the caller.
from pyspark.sql import functions as F
# Spark SQL expression that rebuilds every element `x` of `references` as a
# new struct:
#   * TAG1 / TAG2 are replaced with literal values;
#   * TAG3 / TAG4 are copied from the original element;
#   * `account` is itself an array, so a nested transform() rebuilds each of
#     its elements `i`, keeping ID / BANK_ID and replacing ADDR1 / ADDR2;
#   * `holder` is a single struct, so it is rebuilt directly the same way.
# Only the fields listed in each struct(...) appear in the result, so every
# field that must be kept has to be listed explicitly.
# NOTE(review): the replacement values are string literals; if an original
# field is not a string (e.g. a numeric TAG1), a cast would presumably be
# needed to keep the column type unchanged -- verify against the real schema.
transform_expr = """
transform(references, x -> struct(
'NEW_VALUE1' as TAG1,
'NEW_VALUE2' as TAG2,
x.TAG3 as TAG3,
x.TAG4 as TAG4,
transform(x.account, i -> struct(
i.ID as ID,
i.BANK_ID as BANK_ID,
'NEW_ADDR1' as ADDR1,
'NEW_ADDR2' as ADDR2
)
) as account,
struct(
x.holder.ID AS ID,
x.holder.BANK_ID AS BANK_ID,
'NEW_ADDR1' AS ADDR1,
'NEW_ADDR2' AS ADDR2
) as holder
)
)
"""
# Overwrite the `references` column with the result of the SQL expression.
df1 = df.withColumn("references", F.expr(transform_expr))
# Print the first row serialized as a JSON string to inspect the result.
print(df1.toJSON().first())
# Example output:
# {"references":[{"TAG1":"NEW_VALUE1","TAG2":"NEW_VALUE2","TAG3":"VALUE3","TAG4":"VALUE4","account":[{"ID":"A_VALUE1","BANK_ID":"A_VALUE2","ADDR1":"NEW_ADDR1","ADDR2":"NEW_ADDR2"}],"holder":{"ID":"H_VALUE1","BANK_ID":"H_VALUE2","ADDR1":"NEW_ADDR1","ADDR2":"NEW_ADDR2"}},{"TAG1":"NEW_VALUE1","TAG2":"NEW_VALUE2","TAG3":"VALUE3","TAG4":"VALUE4","account":[{"ID":"A_VALUE1","BANK_ID":"A_VALUE2","ADDR1":"NEW_ADDR1","ADDR2":"NEW_ADDR2"}],"holder":{"ID":"H_VALUE1","BANK_ID":"H_VALUE2","ADDR1":"NEW_ADDR1","ADDR2":"NEW_ADDR2"}}]}
df1.write.json("...") # write the output JSON; "..." is a placeholder for the destination path
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.