I am trying to load a JSON file into PySpark. The schema of the file is shown below:
root
|-- alertId: string (nullable = true)
|-- createdTime: string (nullable = true)
|-- customInSeconds: long (nullable = true)
|-- end: string (nullable = true)
|-- from: string (nullable = true)
|-- granularityString: string (nullable = true)
|-- groupId: string (nullable = true)
|-- groupName: string (nullable = true)
|-- hookIds: array (nullable = true)
| |-- element: string (containsNull = true)
|-- instanceId: string (nullable = true)
|-- instanceName: string (nullable = true)
|-- isResolved: boolean (nullable = true)
|-- score: double (nullable = true)
|-- severity: double (nullable = true)
|-- variableRank: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- dimension: struct (nullable = true)
| | | |-- tag_name: string (nullable = true)
| | |-- metricId: string (nullable = true)
| | |-- metricName: string (nullable = true)
| | |-- score: double (nullable = true)
| | |-- seriesId: string (nullable = true)
| | |-- severity: double (nullable = true)
I tried to write a schema using StructField to parse the JSON file from Event Hub, but it keeps failing.
# Asker's attempted schema for the alert JSON. The NOTE(review) comments mark
# each field whose declared type departs from the printed file schema above.
schema= StructType([
StructField("alertId", StringType(), True),
StructField("createdTime", StringType(), True),
StructField("customInSeconds", LongType(), True),
StructField("end", StringType(), True),
StructField("from", StringType(), True),
StructField("granularityString", StringType(), True),
StructField("groupId", StringType(), True),
StructField("groupName", StringType(), True),
# NOTE(review): the printed schema shows hookIds as an array of strings.
# ArrayType takes the element's data type, so wrapping a StructField here is
# presumably what breaks parsing -- compare ArrayType(StringType()).
StructField("hookIds", ArrayType(StructField("element", StringType()))),
StructField("instanceId", StringType(), True),
StructField("instanceName", StringType(), True),
# NOTE(review): the printed schema lists isResolved as boolean, not string.
StructField("isResolved", StringType(), True),
StructField("score", DoubleType(), True),
# NOTE(review): the printed schema lists the top-level severity as double, not string.
StructField("severity", StringType(), True),
# variableRank is an array of structs; each element carries its own score/severity.
StructField("variableRank", ArrayType(
StructType([StructField("dimension",StructType([StructField("tag_name",StringType(),True)])),StructField("metricId",StringType(),True),StructField("metricName",StringType(),True),
StructField("score", DoubleType(),True),
# NOTE(review): the printed schema lists seriesId as string, not integer.
StructField("seriesId", IntegerType(),True),
StructField("severity", DoubleType(),True)])))])
I get an error saying that the data can't be parsed with the schema I created. Could someone please fix the code so that the data loads successfully?
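The schema definition that matches the file schema you listed is shown below. Compared with your attempt, hookIds becomes ArrayType(StringType()) instead of wrapping a StructField, isResolved becomes BooleanType, the top-level severity becomes DoubleType, and seriesId inside variableRank becomes StringType.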
from pyspark.sql.types import ArrayType, DoubleType, StructType, StructField, \
StringType, LongType, BooleanType
# Schema mirroring the printSchema() output of the alert JSON payload.
# Assembled with the StructType.add() builder; every field is nullable
# (either passed explicitly as True or left at the default).
schema = (
    StructType()
    .add("alertId", StringType(), True)
    .add("createdTime", StringType(), True)
    .add("customInSeconds", LongType(), True)
    .add("end", StringType(), True)
    .add("from", StringType(), True)
    .add("granularityString", StringType(), True)
    .add("groupId", StringType(), True)
    .add("groupName", StringType(), True)
    # Plain array of strings: the element type is given directly.
    .add("hookIds", ArrayType(StringType()))
    .add("instanceId", StringType(), True)
    .add("instanceName", StringType(), True)
    .add("isResolved", BooleanType(), True)
    .add("score", DoubleType(), True)
    .add("severity", DoubleType(), True)
    # Array of structs; each element has a nested "dimension" struct.
    .add(
        "variableRank",
        ArrayType(
            StructType()
            .add("dimension", StructType().add("tag_name", StringType(), True))
            .add("metricId", StringType(), True)
            .add("metricName", StringType(), True)
            .add("score", DoubleType(), True)
            .add("seriesId", StringType(), True)
            .add("severity", DoubleType(), True)
        ),
    )
)
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType, StructType, StructField, \
StringType, LongType, BooleanType
# Schema for the alert JSON, matching the printed file schema field for field.
# Assembled with the StructType.add() builder; every field is nullable
# (either passed explicitly as True or left at the default).
schema = (
    StructType()
    .add("alertId", StringType(), True)
    .add("createdTime", StringType(), True)
    .add("customInSeconds", LongType(), True)
    .add("end", StringType(), True)
    .add("from", StringType(), True)
    .add("granularityString", StringType(), True)
    .add("groupId", StringType(), True)
    .add("groupName", StringType(), True)
    # Plain array of strings: the element type is given directly.
    .add("hookIds", ArrayType(StringType()))
    .add("instanceId", StringType(), True)
    .add("instanceName", StringType(), True)
    .add("isResolved", BooleanType(), True)
    .add("score", DoubleType(), True)
    .add("severity", DoubleType(), True)
    # Array of structs; each element has a nested "dimension" struct.
    .add(
        "variableRank",
        ArrayType(
            StructType()
            .add("dimension", StructType().add("tag_name", StringType(), True))
            .add("metricId", StringType(), True)
            .add("metricName", StringType(), True)
            .add("score", DoubleType(), True)
            .add("seriesId", StringType(), True)
            .add("severity", DoubleType(), True)
        ),
    )
)
# Sample alert payload as one JSON document; the adjacent string literals
# below are concatenated into a single string.
data = (
    '{"alertId": "1", "createdTime": "2021-12-25 12:00:00", '
    '"customInSeconds": 10000000, "end": "end_val", "from": "from_val", '
    '"granularityString": "granularityString_val", '
    '"groupId": "groupId_val", "groupName": "groupName_val", '
    '"hookIds": ["hookIds1", "hookIds2"], '
    '"instanceId": "instanceId_val", "instanceName": "instanceName_val", '
    '"isResolved": true, "score": 10.0, "severity": 1.0, '
    '"variableRank": [{"dimension": {"tag_name": "tag_name_val"}, '
    '"metricId": "metricId_val", "metricName": "metricName_val", '
    '"score": 10.0, "seriesId": "seriesId_val", "severity": 1.0}]}'
)

# One-row DataFrame holding the raw JSON text in a single string column.
df = spark.createDataFrame([(data,)], ("json_str",))

# Parse the JSON column with the schema, then expand the parsed struct's
# fields into top-level columns and print the resulting schema.
parsed = df.withColumn("json_val", F.from_json(F.col("json_str"), schema))
parsed.select("json_val.*").printSchema()
root
|-- alertId: string (nullable = true)
|-- createdTime: string (nullable = true)
|-- customInSeconds: long (nullable = true)
|-- end: string (nullable = true)
|-- from: string (nullable = true)
|-- granularityString: string (nullable = true)
|-- groupId: string (nullable = true)
|-- groupName: string (nullable = true)
|-- hookIds: array (nullable = true)
| |-- element: string (containsNull = true)
|-- instanceId: string (nullable = true)
|-- instanceName: string (nullable = true)
|-- isResolved: boolean (nullable = true)
|-- score: double (nullable = true)
|-- severity: double (nullable = true)
|-- variableRank: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- dimension: struct (nullable = true)
| | | |-- tag_name: string (nullable = true)
| | |-- metricId: string (nullable = true)
| | |-- metricName: string (nullable = true)
| | |-- score: double (nullable = true)
| | |-- seriesId: string (nullable = true)
| | |-- severity: double (nullable = true)
The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please indicate the site URL or the original address. If you have any questions, please contact: yoyou2525@163.com.