简体   繁体   中英

Multilevel Complex Nested Json Using Spark SQL

I have a specific requirement to convert data from several related tables into nested JSON like the example below, using Spark SQL. I have achieved this with Scala, but I have not been able to do it in Spark SQL.

{
"REPORTING_CARRIER":"9E", "DISTANCE":"3132", "ORIGIN_STATE_NM":"Pennsylvania", "QUARTER":"2", "YEAR":"2017", "ITIN_GEO_TYPE":"2", "BULK_FARE":"0", "ORIGIN":"ABE", "ORIGIN_AIRPORT_ID":"10135", "ITIN_FARE":"787", "ORIGIN_CITY_MARKET_ID":"30135", "ROUNDTRIP":"1", "Market":[
{
"MKT_DISTANCE":"1566", "MKT_BULK_FARE":"0", "MKT_NO_OF_CPNS":"2", "MKT_DEST_STATE_NM":"Texas", "MKT_OP_CARR_GRP":"9E:DL", "MKT_TK_CARR_GRP":"DL:DL", "MKT_MILES_FLOWN":"1566", "MKT_AIRPORT_GROUP":"ABE:ATL:SAT", "MKT_FARE_AMT":"393.5", "MKT_ORIG_STATE_NM":"Pennsylvania", "MKT_DEST_ARPT_CITY_NM":"33214", "MKT_RPTG_CARR_NM":"9E", "MKT_DEST":"SAT", "MKT_DEST_CNTRY":"US", "MKT_ORIG_CNTRY":"US", "Coupon":[
{
"CPN_STATE_NM":"Georgia", "CPN_DEST":"ATL", "CPN_TKT_CARR_NM":"DL", "TRIP_BREAK":"", "CPN_MKT_ORIG_ARPT_NM":"10135", "CLASS_OF_SVC":"X", "CPN_TKT_NBR":"2017245", "CPN_DEST_CITY_MKT_NM":"30397", "CPN_DISTANCE":"692", "SEQ_NUM":"1", "ITIN_GEO_TYPE":"2", "CPN_RPTG_CARR_NM":"9E", "COUPON_GEO_TYPE":"2", "CPN_ORIG_STATE_NM":"Pennsylvania", "CPN_OPERG_CARR_NM":"9E", "CPN_ORIG":"ABE", "CPN_PASSENGERS":"1", "COUPON_TYPE":"A", "CPN_DEST_ARPT_NM":"10397", "CPN_MKT_ORIG_CITY_NM":"30135", "CPN_DEST_CNTRY":"US", "CPN_MKT_ID":"201724501", "CPN_ORIG_CNTRY":"US" }, {
"CPN_STATE_NM":"Texas", "CPN_DEST":"SAT", "CPN_TKT_CARR_NM":"DL", "TRIP_BREAK":"X", "CPN_MKT_ORIG_ARPT_NM":"10397", "CLASS_OF_SVC":"X", "CPN_TKT_NBR":"2017245", "CPN_DEST_CITY_MKT_NM":"33214", "CPN_DISTANCE":"874", "SEQ_NUM":"2", "ITIN_GEO_TYPE":"2", "CPN_RPTG_CARR_NM":"9E", "COUPON_GEO_TYPE":"2", "CPN_ORIG_STATE_NM":"Georgia", "CPN_OPERG_CARR_NM":"DL", "CPN_ORIG":"ATL", "CPN_PASSENGERS":"1", "COUPON_TYPE":"A", "CPN_DEST_ARPT_NM":"14683", "CPN_MKT_ORIG_CITY_NM":"30397", "CPN_DEST_CNTRY":"US", "CPN_MKT_ID":"201724501", "CPN_ORIG_CNTRY":"US" } ], "MKT_ITIN_ID":"2017245", "MKT_OPERG_CARR_NM":"99", "MKT_DEST_ARPT_NM":"14683", "MKT_ORIG_ARPT_NM":"ABE", "MKT_ITIN_GEO_TYPE":"2", "MKT_PASSENGERS":"1", "MKT_ID":"201724501", "MKT_TKT_CARR_NM":"DL" }, {
"MKT_DISTANCE":"1566", "MKT_BULK_FARE":"0", "MKT_NO_OF_CPNS":"2", "MKT_DEST_STATE_NM":"Pennsylvania", "MKT_OP_CARR_GRP":"DL:DL", "MKT_TK_CARR_GRP":"DL:DL", "MKT_MILES_FLOWN":"1566", "MKT_AIRPORT_GROUP":"SAT:ATL:ABE", "MKT_FARE_AMT":"393.5", "MKT_ORIG_STATE_NM":"Texas", "MKT_DEST_ARPT_CITY_NM":"30135", "MKT_RPTG_CARR_NM":"9E", "MKT_DEST":"ABE", "MKT_DEST_CNTRY":"US", "MKT_ORIG_CNTRY":"US", "Coupon":[
{
"CPN_STATE_NM":"Georgia", "CPN_DEST":"ATL", "CPN_TKT_CARR_NM":"DL", "TRIP_BREAK":"", "CPN_MKT_ORIG_ARPT_NM":"14683", "CLASS_OF_SVC":"X", "CPN_TKT_NBR":"2017245", "CPN_DEST_CITY_MKT_NM":"30397", "CPN_DISTANCE":"874", "SEQ_NUM":"3", "ITIN_GEO_TYPE":"2", "CPN_RPTG_CARR_NM":"9E", "COUPON_GEO_TYPE":"2", "CPN_ORIG_STATE_NM":"Texas", "CPN_OPERG_CARR_NM":"DL", "CPN_ORIG":"SAT", "CPN_PASSENGERS":"1", "COUPON_TYPE":"A", "CPN_DEST_ARPT_NM":"10397", "CPN_MKT_ORIG_CITY_NM":"33214", "CPN_DEST_CNTRY":"US", "CPN_MKT_ID":"201724503", "CPN_ORIG_CNTRY":"US" }, {
"CPN_STATE_NM":"Pennsylvania", "CPN_DEST":"ABE", "CPN_TKT_CARR_NM":"DL", "TRIP_BREAK":"X", "CPN_MKT_ORIG_ARPT_NM":"10397", "CLASS_OF_SVC":"X", "CPN_TKT_NBR":"2017245", "CPN_DEST_CITY_MKT_NM":"30135", "CPN_DISTANCE":"692", "SEQ_NUM":"4", "ITIN_GEO_TYPE":"2", "CPN_RPTG_CARR_NM":"9E", "COUPON_GEO_TYPE":"2", "CPN_ORIG_STATE_NM":"Georgia", "CPN_OPERG_CARR_NM":"DL", "CPN_ORIG":"ATL", "CPN_PASSENGERS":"1", "COUPON_TYPE":"A", "CPN_DEST_ARPT_NM":"10135", "CPN_MKT_ORIG_CITY_NM":"30397", "CPN_DEST_CNTRY":"US", "CPN_MKT_ID":"201724503", "CPN_ORIG_CNTRY":"US" } ], "MKT_ITIN_ID":"2017245", "MKT_OPERG_CARR_NM":"DL", "MKT_DEST_ARPT_NM":"10135", "MKT_ORIG_ARPT_NM":"SAT", "MKT_ITIN_GEO_TYPE":"2", "MKT_PASSENGERS":"1", "MKT_ID":"201724503", "MKT_TKT_CARR_NM":"DL" } ], "NO_OF_CPNS":"4", "ORIGIN_COUNTRY":"US", "ITIN_ID":"2017245", "PASSENGERS":"1", "MILES_FLOWN":"3132" }

You can use the from_json() helper function inside a select() call of the Dataset API to decode a JSON string's attributes and values into DataFrame columns, as dictated by a schema.

For example, given the following JSON { "reporting_carrier": "A", "market": { "value": 10 } }, stored in rawJsonDf:

import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{LongType, StructType}
// Also needs `import spark.implicits._` for `$"..."` and `.as[...]`
// (spark-shell imports it automatically).

/** One input row: the carrier code plus the still-encoded "market" JSON string. */
case class MarketData(reporting_carrier: String, market_json: String)

// Schema of the JSON document held in `market_json`, e.g. {"value": 10}.
val jsonSchema = new StructType()
  .add("value", LongType)

// Rename the columns to match the MarketData field names so that
// `.as[MarketData]` can resolve them, and keep the typed Dataset in a val
// so the query below actually runs against the renamed columns.
// NOTE(review): assumes rawJsonDf has exactly two string columns — confirm.
val marketDs = rawJsonDf
  .toDF("reporting_carrier", "market_json")
  .as[MarketData]

// Decode the JSON string into a struct column named "market", then filter
// on a field of the decoded struct.
marketDs
  .select(from_json($"market_json", jsonSchema) as "market")
  .filter($"market.value" > 5)

See this great tutorial by Databricks for more info.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM