I am using Python 3.7.12 and PySpark 2.4.0.
I am trying to parse a JSON string column with from_json and an explicit schema. However, the resulting DataFrame contains only null values. I assume I am specifying the schema or the column types incorrectly.
The code below shows the JSON string that I extracted from a table column using get_json_object:
# Keep `id` and extract the JSON text stored under the top-level "data" key
# of `pulled_col`.
# The result is aliased as "data" because an un-aliased expression column is
# named after the expression itself, while the printed output and every
# later step refer to this column as `data`.
df = df.select(
    col('id'),
    get_json_object(col("pulled_col"), "$.data").alias("data"),
)
df.head()
#Row(id = '0123456', data = '[
#{"time" : [], "history" : [], "zip" : "78910", "phnumber" : "5678910123", "name" : "-"},
#{"time" : [], "history" : [], "zip" : "78920", "phnumber" : "5678910123", "name" : "-"},
#{"time" : [], "history" : [], "zip" : "78930", "phnumber" : "5678910123", "name" : "-"},
#{"time" : [], "history" : [], "zip" : "78910", "phnumber" : "5678910123", "name" : "-"}
#]')
# Inspect the frame: both columns are strings, so `data` is still raw,
# unparsed JSON text at this point.
df.printSchema()
#root
# |-- id: string (nullable = true)
# |-- data: string (nullable = true)
df.show()
#+-------+----------------------------+
#|     id|                        data|
#+-------+----------------------------+
#|0123456|[{"time" : [], "history"....|
#|0123456|[{"time" : [], "history"....|
#+-------+----------------------------+
# Attempt 1: read "zip" directly from the JSON text with a JSON path.
# NOTE(review): the Row printed earlier shows `data` starting with "[", i.e.
# a JSON array, so the root has no "zip" key for "$.zip" to match. That is
# presumably why every row comes back null.
# NOTE(review): `.show()` prints the frame rather than returning it, so
# `test` likely ends up as None - confirm before reusing it.
test = df.select(col("id"), get_json_object(col("data"),"$.zip")\
.alias("zip"))\
.show(truncate=False)
# The output shouldn't be null?
#+-------+----+
#|     id| zip|
#+-------+----+
#|0123456|null|
#|0123456|null|
#+-------+----+
# Attempt 2: parse `data` with from_json, using a schema that describes a
# single JSON object with three nullable string fields.
schema = StructType(
[
StructField('zip', StringType(), True),
StructField('phnumber', StringType(), True),
StructField('name', StringType(), True)
]
)
# Replace the JSON text with the parsed struct, then flatten its fields into
# top-level columns next to `id`.
# NOTE(review): the schema above is a struct, but the Row printed earlier
# shows `data` as a JSON array of such objects; that mismatch is the likely
# cause of the nulls below.
data_json = df.withColumn("data", from_json("data", schema))\
.select(col('id'), col('data.*'))
# The df output shouldn't be null for the new json schema?
data_json.show()
#+-------+----+---------+-----+
#|     id| zip| phnumber| name|
#+-------+----+---------+-----+
#|0123456|null|     null| null|
#|0123456|null|     null| null|
#+-------+----+---------+-----+
The data column actually contains a JSON array, so the schema must be an ArrayType:
# The JSON text is an array of objects, so the per-object struct is wrapped
# in an ArrayType. Each extracted field is a nullable string.
# NOTE(review): `F` is assumed to be `pyspark.sql.functions` - confirm the
# import, since the earlier snippets call `from_json` without a prefix.
schema = ArrayType(
    StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ('zip', 'phnumber', 'name')
        ]
    )
)
# Replace the raw JSON text in `data` with the parsed array of structs.
data_json = df.withColumn("data", F.from_json("data", schema))
which results in the following schema:
root
 |-- id: long (nullable = true)
 |-- data: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- zip: string (nullable = true)
 |    |    |-- phnumber: string (nullable = true)
 |    |    |-- name: string (nullable = true)
Now if you want each element of the array in a separate row, you can explode it and extract the fields you need:
# Parse the JSON text into an array of structs, emit one row per array
# element, then flatten each struct's fields into columns next to `id`.
data_json = (
    df.withColumn("data", F.explode(F.from_json("data", schema)))
    .select("id", "data.*")
)
Result:
+---+-----+----------+----+
| id|  zip|  phnumber|name|
+---+-----+----------+----+
|  1|78910|5678910123|   -|
|  1|78920|5678910123|   -|
+---+-----+----------+----+
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.