![](/img/trans.png)
[英]Can I transform a complex json object to multiple rows in a dataframe in Azure Databricks using pyspark?
[英]transform a complex json object from a web api to multiple rows in a dataframe in Azure Databricks using pyspark?
我从一个 REST API 收到了一份 JSON 数据,返回内容的示例如下:
{
"d": {
"results": [
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')",
"type": "EmpEmployment"
},
"personIdExternal": "60000033",
"userId": "60000033",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1642917586000+0000)/",
"endDate": "/Date(1675123200000)/",
"createdDateTime": "/Date(1641473919000+0000)/",
"createdOn": "/Date(1641473919000)/",
"originalStartDate": "/Date(1501545600000)/",
"customDate1": "/Date(1501545600000)/",
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dude",
"okToRehire": true,
"customString4": null,
"customString3": "3",
"customString2": null,
"assignmentIdExternal": "60000033",
"customString16": null,
"lastModifiedOn": "/Date(1642917586000)/",
"customString1": null,
"createdBy": "This Dudette",
"seniorityDate": "/Date(1501545600000)/",
"startDate": "/Date(1659398400000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='60000033',userId='60000033')/empPayCompNonRecurringNav"
}
}
},
{
"__metadata": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')",
"type": "EmpEmployment"
},
"personIdExternal": "100003",
"userId": "100003",
"hiringNotCompleted": false,
"isECRecord": true,
"lastModifiedDateTime": "/Date(1638051713000+0000)/",
"endDate": null,
"createdDateTime": "/Date(1638051713000+0000)/",
"createdOn": "/Date(1638051713000)/",
"originalStartDate": "/Date(1635724800000)/",
"customDate1": null,
"customString17": null,
"customString18": null,
"customString19": null,
"assignmentClass": "ST",
"lastModifiedBy": "This Dudette",
"okToRehire": null,
"customString4": null,
"customString3": null,
"customString2": null,
"assignmentIdExternal": "100003",
"customString16": null,
"lastModifiedOn": "/Date(1638051713000)/",
"customString1": null,
"createdBy": "This Dude",
"seniorityDate": "/Date(1635724800000)/",
"startDate": "/Date(1635724800000)/",
"customString16Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString16Nav"
}
},
"customString1Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString1Nav"
}
},
"customString18Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString18Nav"
}
},
"customString3Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString3Nav"
}
},
"paymentInformationNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/paymentInformationNav"
}
},
"empJobRelationshipNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empJobRelationshipNav"
}
},
"personNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/personNav"
}
},
"empWorkPermitNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empWorkPermitNav"
}
},
"photoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/photoNav"
}
},
"compInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/compInfoNav"
}
},
"userNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/userNav"
}
},
"customString2Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString2Nav"
}
},
"customString19Nav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/customString19Nav"
}
},
"jobInfoNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/jobInfoNav"
}
},
"wfRequestNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/wfRequestNav"
}
},
"costDistributionNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/costDistributionNav"
}
},
"empPayCompNonRecurringNav": {
"__deferred": {
"uri": "https://someapi.insomeplace.com/odata/v2/EmpEmployment(personIdExternal='100003',userId='100003')/empPayCompNonRecurringNav"
}
}
}
]
}
}
我目前只想从这份 JSON 中提取 userId 和 startDate。我已经按照下面这个示例尝试使用 explode 函数:
https://adatis.co.uk/parsing-nested-json-lists-in-databricks-using-python/
但我似乎只能把整个 results 放进数据框的一列里,或者在使用以下代码时:
请给我指一个正确的方向。我是否最好先构建一个自定义 schema,再尝试按该 schema 解析 JSON?
我想要的就是像图片中那样返回结果,但每一对 userId 和 startDate 各占一行,因为它们是相互对应的。
对 results 数组使用 explode,将其中的每个元素展开为单独的一行:
df = spark.read.json("./sample.json", multiLine=True)
df2 = df.withColumn('d', explode(col('d.results')))
df2.select(df2.d.userId, df2.d.startDate).show(10,False)
+--------+---------------------+
|d.userId|d.startDate |
+--------+---------------------+
|60000033|/Date(1659398400000)/|
|100003 |/Date(1635724800000)/|
+--------+---------------------+
您可以根据需要选取任意数量的属性,例如:
df.select(explode(col('d.results'))).\
selectExpr("col.userId","col.startDate","col.lastModifiedBy").\
show(10,False)
+--------+---------------------+--------------+
|userId |startDate |lastModifiedBy|
+--------+---------------------+--------------+
|60000033|/Date(1659398400000)/|This Dude |
|100003 |/Date(1635724800000)/|This Dudette |
+--------+---------------------+--------------+
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.