简体   繁体   中英

pyspark how to create schema for array and struct to read json

I am trying to load a JSON file into PySpark. The schema of the file is shown below:

root
|-- alertId: string (nullable = true)
|-- createdTime: string (nullable = true)
|-- customInSeconds: long (nullable = true)
|-- end: string (nullable = true)
|-- from: string (nullable = true)
|-- granularityString: string (nullable = true)
|-- groupId: string (nullable = true)
|-- groupName: string (nullable = true)
|-- hookIds: array (nullable = true)
|    |-- element: string (containsNull = true)
|-- instanceId: string (nullable = true)
|-- instanceName: string (nullable = true)
|-- isResolved: boolean (nullable = true)
|-- score: double (nullable = true)
|-- severity: double (nullable = true)
|-- variableRank: array (nullable = true)
|    |-- element: struct (containsNull = true)
|    |    |-- dimension: struct (nullable = true)
|    |    |    |-- tag_name: string (nullable = true)
|    |    |-- metricId: string (nullable = true)
|    |    |-- metricName: string (nullable = true)
|    |    |-- score: double (nullable = true)
|    |    |-- seriesId: string (nullable = true)
|    |    |-- severity: double (nullable = true)

I tried to write a schema using StructField to parse the JSON file coming from Event Hub, but it keeps failing.

# Asker's attempted schema for the alert JSON payload.
# NOTE(review): four fields below disagree with the printed schema of the file
# (hookIds element type, isResolved, severity, variableRank.seriesId); each is
# flagged in place.
schema= StructType([
    StructField("alertId", StringType(), True),
    StructField("createdTime", StringType(), True),
    StructField("customInSeconds", LongType(), True),
    StructField("end", StringType(), True),
    StructField("from", StringType(), True),
    StructField("granularityString", StringType(), True),
    StructField("groupId", StringType(), True),
    StructField("groupName", StringType(), True),
    # NOTE(review): the file schema lists hookIds as an array whose element is
    # a plain string; here the element type is given as a StructField instead.
    StructField("hookIds", ArrayType(StructField("element", StringType()))),
    StructField("instanceId", StringType(), True),
    StructField("instanceName", StringType(), True),
    # NOTE(review): the file schema lists isResolved as boolean, not string.
    StructField("isResolved", StringType(), True),
    StructField("score", DoubleType(), True),
    # NOTE(review): the file schema lists severity as double, not string.
    StructField("severity", StringType(), True),
    # variableRank: array of structs, each with a nested "dimension" struct.
    StructField("variableRank", ArrayType(
        StructType([StructField("dimension",StructType([StructField("tag_name",StringType(),True)])),StructField("metricId",StringType(),True),StructField("metricName",StringType(),True),
                  StructField("score", DoubleType(),True),
                  # NOTE(review): the file schema lists seriesId as string, not integer.
                  StructField("seriesId", IntegerType(),True),
                  StructField("severity", DoubleType(),True)])))])

I get an error saying that the data can't be parsed with the schema I created. Could someone please fix the code so that the data loads successfully?

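Your schema does not match the file schema you listed: hookIds should be an array of plain strings (ArrayType(StringType()), not an ArrayType wrapping a StructField), isResolved should be BooleanType, severity should be DoubleType, and seriesId inside variableRank should be StringType. The corrected schema definition is: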

from pyspark.sql.types import ArrayType, DoubleType, StructType, StructField, \
    StringType, LongType, BooleanType

# Schema for the alert JSON payload, assembled field by field with
# StructType.add(); every field is nullable.
schema = (
    StructType()
    .add("alertId", StringType(), True)
    .add("createdTime", StringType(), True)
    .add("customInSeconds", LongType(), True)
    .add("end", StringType(), True)
    .add("from", StringType(), True)
    .add("granularityString", StringType(), True)
    .add("groupId", StringType(), True)
    .add("groupName", StringType(), True)
    # hookIds is a plain array of strings.
    .add("hookIds", ArrayType(StringType()))
    .add("instanceId", StringType(), True)
    .add("instanceName", StringType(), True)
    .add("isResolved", BooleanType(), True)
    .add("score", DoubleType(), True)
    .add("severity", DoubleType(), True)
    # variableRank is an array of structs; each struct carries a nested
    # one-field "dimension" struct plus per-metric details.
    .add(
        "variableRank",
        ArrayType(
            StructType()
            .add("dimension", StructType().add("tag_name", StringType(), True))
            .add("metricId", StringType(), True)
            .add("metricName", StringType(), True)
            .add("score", DoubleType(), True)
            .add("seriesId", StringType(), True)
            .add("severity", DoubleType(), True)
        ),
    )
)

Working Example

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType, StructType, StructField, \
    StringType, LongType, BooleanType

# Schema for the alert JSON payload, assembled field by field with
# StructType.add(); every field is nullable.
schema = (
    StructType()
    .add("alertId", StringType(), True)
    .add("createdTime", StringType(), True)
    .add("customInSeconds", LongType(), True)
    .add("end", StringType(), True)
    .add("from", StringType(), True)
    .add("granularityString", StringType(), True)
    .add("groupId", StringType(), True)
    .add("groupName", StringType(), True)
    # hookIds is a plain array of strings.
    .add("hookIds", ArrayType(StringType()))
    .add("instanceId", StringType(), True)
    .add("instanceName", StringType(), True)
    .add("isResolved", BooleanType(), True)
    .add("score", DoubleType(), True)
    .add("severity", DoubleType(), True)
    # variableRank is an array of structs; each struct carries a nested
    # one-field "dimension" struct plus per-metric details.
    .add(
        "variableRank",
        ArrayType(
            StructType()
            .add("dimension", StructType().add("tag_name", StringType(), True))
            .add("metricId", StringType(), True)
            .add("metricName", StringType(), True)
            .add("score", DoubleType(), True)
            .add("seriesId", StringType(), True)
            .add("severity", DoubleType(), True)
        ),
    )
)

# One sample alert document, kept as a raw JSON string (adjacent literals are
# concatenated by the parser).
data = (
    '{"alertId": "1", "createdTime": "2021-12-25 12:00:00", "customInSeconds": 10000000, "end": "end_val", '
    '"from": "from_val", "granularityString": "granularityString_val", "groupId": "groupId_val", "groupName": '
    '"groupName_val", "hookIds": ["hookIds1", "hookIds2"], "instanceId": "instanceId_val", "instanceName": '
    '"instanceName_val", "isResolved": true, "score": 10.0, "severity": 1.0, "variableRank": [{"dimension": {'
    '"tag_name": "tag_name_val"}, "metricId": "metricId_val", "metricName": "metricName_val", "score": 10.0, '
    '"seriesId": "seriesId_val", "severity": 1.0}]}'
)

# Single-row DataFrame with one column, "json_str", holding the raw JSON text.
df = spark.createDataFrame([(data,)], ("json_str",))

# Parse the JSON text with the explicit schema into a struct column, expand
# that struct into top-level columns, and print the resulting schema.
(
    df.withColumn("json_val", F.from_json("json_str", schema))
    .select("json_val.*")
    .printSchema()
)

Output

root
 |-- alertId: string (nullable = true)
 |-- createdTime: string (nullable = true)
 |-- customInSeconds: long (nullable = true)
 |-- end: string (nullable = true)
 |-- from: string (nullable = true)
 |-- granularityString: string (nullable = true)
 |-- groupId: string (nullable = true)
 |-- groupName: string (nullable = true)
 |-- hookIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- instanceId: string (nullable = true)
 |-- instanceName: string (nullable = true)
 |-- isResolved: boolean (nullable = true)
 |-- score: double (nullable = true)
 |-- severity: double (nullable = true)
 |-- variableRank: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- dimension: struct (nullable = true)
 |    |    |    |-- tag_name: string (nullable = true)
 |    |    |-- metricId: string (nullable = true)
 |    |    |-- metricName: string (nullable = true)
 |    |    |-- score: double (nullable = true)
 |    |    |-- seriesId: string (nullable = true)
 |    |    |-- severity: double (nullable = true)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM