I have this nested JSON in which I'm exploding the data out of. I do this via spark.sql(), but how can I pull out elements that appear multiple times? I would like to have a row for each of those specific elements.
For instance, here is a sample of my JSON file:
{
"httpStatus": 200,
"httpStatusMessage": "success",
"timestamp": "2020-11-11T19:46:01",
"response": {
"header": {
"fleetId": 10006,
"fleetName": "Naples",
"date": "2020-11-04T00:00:00",
"gpsUnit": "dd",
"speedUnit": "mph",
"accelUnit": "g",
"distanceUnit": "miles"
},
"body": {
"dataProviders": [
{
"dataProviderId": 14,
"drivers": [
{
"driverRef": "b119",
"driverId": 1000611055,
"driverFirstName": "John",
"driverLastName": "Doedoe",
"totalDistance": 238.87,
"vehicles": [
{
"deviceRef": null,
"deviceId": 0,
"vehicleRef": "1LNHL9DKXFG607058",
"vehicleId": 1000621018,
"trips": [
{
"tripId": "2020-11-04-10006-1000616018-10",
"tripDuration": 93.42,
"tripDistanceTravelled": 60.93,
"averageSpeed": 39.1,
"tripStart": {
"longitude": -81.7546463012695,
"latitude": 26.5263614654541,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T15:54:11Z"
},
"tripStop": {
"longitude": -81.7561874389648,
"latitude": 26.5265655517578,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T17:27:36.063Z"
}
},
{
"tripId": "2020-11-04-10006-1000616018-11",
"tripDuration": 60.03,
"tripDistanceTravelled": 32.11,
"averageSpeed": 32.1,
"tripStart": {
"longitude": -81.7561874389648,
"latitude": 26.5265655517578,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T17:46:55.063Z"
},
"tripStop": {
"longitude": -81.7723388671875,
"latitude": 26.2240772247314,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T18:46:57Z"
}
},
{
"tripId": "2020-11-04-10006-1000616018-8",
"tripDuration": 49.77,
"tripDistanceTravelled": 25.56,
"averageSpeed": 30.8,
"tripStart": {
"longitude": -81.7721862792969,
"latitude": 26.2239742279053,
"heading": 0,
"speed": 5.6,
"mileage": 0,
"timestamp": "2020-11-04T14:29:41Z"
},
"tripStop": {
"longitude": -81.7538528442383,
"latitude": 26.5260028839111,
"heading": 0,
"speed": 5.6,
"mileage": 0,
"timestamp": "2020-11-04T15:19:27Z"
}
}
]
},
{
"deviceRef": null,
"deviceId": 0,
"vehicleRef": "1GNSCGKCXKR180027",
"vehicleId": 1000621026,
"trips": [
{
"tripId": "2020-11-04-10006-1000616026-2",
"tripDuration": 59.95,
"tripDistanceTravelled": 30.41,
"averageSpeed": 30.4,
"tripStart": {
"longitude": -81.754524230957,
"latitude": 26.5265655517578,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T01:23:54.063Z"
},
"tripStop": {
"longitude": -81.7721328735352,
"latitude": 26.2240772247314,
"heading": 0,
"speed": 0,
"mileage": 0,
"timestamp": "2020-11-04T02:23:51.063Z"
}
}
]
}
]
}
]
}
]
}
}
}
There is three instances of trips and within trips there are three instances of tripId, tripDuration, etc...
As I explode the data out I'm starting to get all the data in an array and I need to pull that out properly but not sure how to pull out that data and store it in its own row, matching it with the parent data.
Here is my sample code:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
json_df = spark.read.json('/user/myuser/json_directory/sample.json')
json_df.printSchema()
json_df.createOrReplaceTempView('test_dictionary')
spark.sql('''select * from test_dictionary''').show()
spark.sql('''select
httpStatus
, httpStatusMessage
, test.dataProviderId
, tmp1.driverFirstName
, tmp1.driverId
, tmp1.driverLastName
, tmp1.driverRef
, tmp1.totalDistance
, tmp2.deviceId
, tmp2.vehicleId
, tmp2.vehicleRef
, tmp3.averageSpeed
, tmp3.tripDistanceTravelled
, tmp3.tripDuration
, tmp3.tripId
from test_dictionary
lateral view outer explode (response.body.dataProviders)a as test
lateral view outer explode (response.body.dataProviders.drivers)b as tmp1
lateral view outer explode (tmp1.vehicles)c as tmp2
lateral view outer explode (tmp2.trips)d as tmp3''').show()
results look like so:
root
|-- httpStatus: long (nullable = true)
|-- httpStatusMessage: string (nullable = true)
|-- response: struct (nullable = true)
| |-- body: struct (nullable = true)
| | |-- dataProviders: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- dataProviderId: long (nullable = true)
| | | | |-- drivers: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- driverFirstName: string (nullable = true)
| | | | | | |-- driverId: long (nullable = true)
| | | | | | |-- driverLastName: string (nullable = true)
| | | | | | |-- driverRef: string (nullable = true)
| | | | | | |-- totalDistance: double (nullable = true)
| | | | | | |-- vehicles: array (nullable = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- deviceId: long (nullable = true)
| | | | | | | | |-- deviceRef: string (nullable = true)
| | | | | | | | |-- trips: array (nullable = true)
| | | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | | |-- averageSpeed: double (nullable = true)
| | | | | | | | | | |-- tripDistanceTravelled: double (nullable = true)
| | | | | | | | | | |-- tripDuration: double (nullable = true)
| | | | | | | | | | |-- tripId: string (nullable = true)
| | | | | | | | | | |-- tripStart: struct (nullable = true)
| | | | | | | | | | | |-- heading: double (nullable = true)
| | | | | | | | | | | |-- latitude: double (nullable = true)
| | | | | | | | | | | |-- longitude: double (nullable = true)
| | | | | | | | | | | |-- mileage: double (nullable = true)
| | | | | | | | | | | |-- speed: double (nullable = true)
| | | | | | | | | | | |-- timestamp: string (nullable = true)
| | | | | | | | | | |-- tripStop: struct (nullable = true)
| | | | | | | | | | | |-- heading: double (nullable = true)
| | | | | | | | | | | |-- latitude: double (nullable = true)
| | | | | | | | | | | |-- longitude: double (nullable = true)
| | | | | | | | | | | |-- mileage: double (nullable = true)
| | | | | | | | | | | |-- speed: double (nullable = true)
| | | | | | | | | | | |-- timestamp: string (nullable = true)
| | | | | | | | |-- vehicleId: long (nullable = true)
| | | | | | | | |-- vehicleRef: string (nullable = true)
| |-- header: struct (nullable = true)
| | |-- accelUnit: string (nullable = true)
| | |-- date: string (nullable = true)
| | |-- distanceUnit: string (nullable = true)
| | |-- fleetId: long (nullable = true)
| | |-- fleetName: string (nullable = true)
| | |-- gpsUnit: string (nullable = true)
| | |-- speedUnit: string (nullable = true)
|-- timestamp: string (nullable = true)
+----------+-----------------+--------------------+-------------------+
|httpStatus|httpStatusMessage| response| timestamp|
+----------+-----------------+--------------------+-------------------+
| 200| success|[[[[14, [[Eric, 1...|2020-11-11T19:46:01|
+----------+-----------------+--------------------+-------------------+
+----------+-----------------+--------------+---------------+------------+--------------+---------+-------------+--------+--------------------+--------------------+------------------+---------------------+--------------------+--------------------+
|httpStatus|httpStatusMessage|dataProviderId|driverFirstName| driverId|driverLastName|driverRef|totalDistance|deviceId| vehicleId| vehicleRef| averageSpeed|tripDistanceTravelled| tripDuration| tripId|
+----------+-----------------+--------------+---------------+------------+--------------+---------+-------------+--------+--------------------+--------------------+------------------+---------------------+--------------------+--------------------+
| 200| success| 14| [John]|[1000611055]| [Doedoe]| [b119]| [238.87]| [0, 0]|[1000621018, 1000...|[1LNHL9DKXFG60705...|[39.1, 32.1, 30.8]| [60.93, 32.11, 25...|[93.42, 60.03, 49...|[2020-11-04-10006...|
| 200| success| 14| [John]|[1000611055]| [Doedoe]| [b119]| [238.87]| [0, 0]|[1000621018, 1000...|[1LNHL9DKXFG60705...| [30.4]| [30.41]| [59.95]|[2020-11-04-10006...|
+----------+-----------------+--------------+---------------+------------+--------------+---------+-------------+--------+--------------------+--------------------+------------------+---------------------+--------------------+--------------------+
How do I get my results to look like so?:
httpStatus| httpStatusMessage| dataProviderId| driverFirstName driverId driverLastName driverRef| totalDistance| deviceId vehicleId vehicleRef averageSpeed tripDistanceTravelled tripDuration tripId tripStartLongitude latitude heading speed mileage timestamp tripstoplongitude latitude heading speed mileage timestamp
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621018 1LNHL9DKXFG607058 39.1 60.93 93.42 2020-11-04-10006-1000616018-10" -81 26 0 0 0 2020-11-04T15:54:11Z -81.75618744 26.52656555 0 0 0 2020-11-04T17:27:36.063Z
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621018 1LNHL9DKXFG607058 32.1 32.11 60.03 2020-11-04-10006-1000616018-11 -81.7 26.5 0 0 0 2020-11-04T17:46:55.063Z -81.77 26.22 0 0 0 2020-11-04T18:46:57Z
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621018 1LNHL9DKXFG607058 30.8 25.56 49.77 2020-11-04-10006-1000616018-8" -81.7 26.2 0 5.6 0 2020-11-04T14:29:41Z -81.74567899 26.52098 0 5.6 0 2020-11-04T15:19:27Z
200 success 14 John 1000611055 Doedoe b119 238.87 0 1000621026 1GNSCGKCXKR180027 30.4 30.41 59.95 2020-11-04-10006-1000616026-2 -81.7 26.5265 0 0 0 2020-11-04T01:23:54.063Z" -81.7721328 26.22407722 0 0 0 2020-11-04T02:23:51.063Z
Check below SQL
.
Explode all Array
type columns.
spark.sql('''
select
httpStatus,
httpStatusMessage,
dataProviders.dataProviderId,
drivers.driverFirstName,
drivers.driverId,
drivers.driverLastName,
drivers.driverRef,
drivers.totalDistance,
vehicles.deviceId,
vehicles.vehicleId,
vehicles.vehicleRef,
trips.averageSpeed,
trips.tripDistanceTravelled,
trips.tripDuration,
trips.tripId,
trips.tripStart.longitude as tripStart_longitude, // Added tripStart as prefix because tripStart & tripStop has same columns.
trips.tripStart.latitude as tripStart_latitude, // Added tripStart as prefix because tripStart & tripStop has same columns.
trips.tripStart.heading as tripStart_heading, // Added tripStart as prefix because tripStart & tripStop has same columns.
trips.tripStart.speed as tripStart_speed, // Added tripStart as prefix because tripStart & tripStop has same columns.
trips.tripStart.mileage as tripStart_mileage, // Added tripStart as prefix because tripStart & tripStop has same columns.
trips.tripStart.timestamp as tripStart_timestamp, // Added tripStart as prefix because tripStart & tripStop has same columns.
trips.tripStop.longitude as tripStop_longitude, // Added tripStop as prefix because tripStart & tripStop has same columns.
trips.tripStop.latitude as tripStop_latitude, // Added tripStop as prefix because tripStart & tripStop has same columns.
trips.tripStop.heading as tripStop_heading, // Added tripStop as prefix because tripStart & tripStop has same columns.
trips.tripStop.speed as tripStop_speed, // Added tripStop as prefix because tripStart & tripStop has same columns.
trips.tripStop.mileage as tripStop_mileage, // Added tripStop as prefix because tripStart & tripStop has same columns.
trips.tripStop.timestamp as tripStop_timestamp // Added tripStop as prefix because tripStart & tripStop has same columns.
from test_dictionary
lateral view outer explode (response.body.dataProviders) tbl_dataProviders as dataProviders // for dataProviders details
lateral view outer explode (dataProviders.drivers) dataProviders_drivers as drivers // for drivers details
lateral view outer explode (drivers.vehicles) drivers_vehicles as vehicles // for vehicles details
lateral view outer explode (vehicles.trips) vehicles_trips as trips // for trips details
''').show(false)
Final Output
+----------+-----------------+--------------+---------------+----------+--------------+---------+-------------+--------+----------+-----------------+------------+---------------------+------------+------------------------------+-------------------+------------------+-----------------+---------------+-----------------+------------------------+------------------+-----------------+----------------+--------------+----------------+------------------------+
|httpStatus|httpStatusMessage|dataProviderId|driverFirstName|driverId |driverLastName|driverRef|totalDistance|deviceId|vehicleId |vehicleRef |averageSpeed|tripDistanceTravelled|tripDuration|tripId |tripStart_longitude|tripStart_latitude|tripStart_heading|tripStart_speed|tripStart_mileage|tripStart_timestamp |tripStop_longitude|tripStop_latitude|tripStop_heading|tripStop_speed|tripStop_mileage|tripStop_timestamp |
+----------+-----------------+--------------+---------------+----------+--------------+---------+-------------+--------+----------+-----------------+------------+---------------------+------------+------------------------------+-------------------+------------------+-----------------+---------------+-----------------+------------------------+------------------+-----------------+----------------+--------------+----------------+------------------------+
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621018|1LNHL9DKXFG607058|39.1 |60.93 |93.42 |2020-11-04-10006-1000616018-10|-81.7546463012695 |26.5263614654541 |0 |0.0 |0 |2020-11-04T15:54:11Z |-81.7561874389648 |26.5265655517578 |0 |0.0 |0 |2020-11-04T17:27:36.063Z|
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621018|1LNHL9DKXFG607058|32.1 |32.11 |60.03 |2020-11-04-10006-1000616018-11|-81.7561874389648 |26.5265655517578 |0 |0.0 |0 |2020-11-04T17:46:55.063Z|-81.7723388671875 |26.2240772247314 |0 |0.0 |0 |2020-11-04T18:46:57Z |
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621018|1LNHL9DKXFG607058|30.8 |25.56 |49.77 |2020-11-04-10006-1000616018-8 |-81.7721862792969 |26.2239742279053 |0 |5.6 |0 |2020-11-04T14:29:41Z |-81.7538528442383 |26.5260028839111 |0 |5.6 |0 |2020-11-04T15:19:27Z |
|200 |success |14 |John |1000611055|Doedoe |b119 |238.87 |0 |1000621026|1GNSCGKCXKR180027|30.4 |30.41 |59.95 |2020-11-04-10006-1000616026-2 |-81.754524230957 |26.5265655517578 |0 |0.0 |0 |2020-11-04T01:23:54.063Z|-81.7721328735352 |26.2240772247314 |0 |0.0 |0 |2020-11-04T02:23:51.063Z|
+----------+-----------------+--------------+---------------+----------+--------------+---------+-------------+--------+----------+-----------------+------------+---------------------+------------+------------------------------+-------------------+------------------+-----------------+---------------+-----------------+------------------------+------------------+-----------------+----------------+--------------+----------------+------------------------+
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.