簡體   English   中英

如何在Apache Spark中將JSON對象添加到數據集/數據框

[英]How to add JSON Object to a Dataset/Dataframe in apache spark

我想知道是否可以使用Spark Dataset API或Apache Spark提供的任何其他功能來創建自定義JSON。

我知道可以使用join()方法聯接兩個數據集,但是我想創建一個自定義JSON,把數據集2(在我的情況下即警報)作為鍵為“ALERT”的JSON對象添加到數據集1(即庫存)中。

// Read the inventory JSON file with the multiLine option enabled and the parse mode set to PERMISSIVE.
Dataset<Row> inventory = spark.read().option("multiLine", true).option("mode", "PERMISSIVE")
                    .json("C:\\Users\\phyadavi\\LearningAndDevelopment\\\\CDXJSONMergeJob\\data1\\inventory.json");
// Read the alert JSON file with the same reader options.
Dataset<Row> alerts = spark.read().option("multiLine", true).option("mode", "PERMISSIVE")
                    .json("C:\\Users\\phyadavi\\LearningAndDevelopment\\\\CDXJSONMergeJob\\data1\\alert.json");

// NOTE(review): join() is called without a join column or condition, and the alert
// columns are not nested under a single key here — this is the behavior the question asks how to change.
Dataset<Row> inventoryAlerts = inventory.join(alerts);
        // Print the schema of the joined dataset.
        inventoryAlerts.printSchema();

庫存和警報的架構如下。

root
 |-- Equipment: struct (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- collectedPid: string (nullable = true)
 |    |    |    |-- collectedSerialNum: string (nullable = true)
 |    |    |    |-- containingHwId: string (nullable = true)
 |    |    |    |-- equipmentType: string (nullable = true)
 |    |    |    |-- hwId: string (nullable = true)
 |    |    |    |-- items: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- tagName: string (nullable = true)
 |    |    |    |    |    |-- tagValue: string (nullable = true)
 |    |    |    |-- pceMultiPid: string (nullable = true)
 |    |    |    |-- pcePhyiscalType: string (nullable = true)
 |    |    |    |-- pcePid: string (nullable = true)
 |    |    |    |-- pceProductDescription: string (nullable = true)
 |    |    |    |-- pceProductFamily: string (nullable = true)
 |    |    |    |-- pceProductType: string (nullable = true)
 |    |    |    |-- pceRuleId: string (nullable = true)
 |    |    |    |-- productDescription: string (nullable = true)
 |    |    |    |-- productFamily: string (nullable = true)
 |    |    |    |-- productId: string (nullable = true)
 |    |    |    |-- productType: string (nullable = true)
 |    |    |    |-- serialNumber: string (nullable = true)
 |    |    |    |-- snasItemType: string (nullable = true)
 |    |    |    |-- snasProductFamily: string (nullable = true)
 |    |    |    |-- snasSerialNumber: string (nullable = true)
 |    |    |    |-- snasValidationCode: string (nullable = true)
 |    |    |    |-- snasValidationSource: string (nullable = true)
 |-- LicenseActivated: struct (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- count: long (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- NetworkElement: struct (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- hostname: string (nullable = true)
 |    |    |    |-- ipAddress: string (nullable = true)
 |    |    |    |-- isManagedNe: boolean (nullable = true)
 |    |    |    |-- items: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- tagName: string (nullable = true)
 |    |    |    |    |    |-- tagValue: string (nullable = true)
 |    |    |    |-- lastUpdateDate: long (nullable = true)
 |    |    |    |-- managedNeId: string (nullable = true)
 |    |    |    |-- managementAddress: string (nullable = true)
 |    |    |    |-- neId: string (nullable = true)
 |    |    |    |-- neName: string (nullable = true)
 |    |    |    |-- neRegistrationStatus: string (nullable = true)
 |    |    |    |-- productFamily: string (nullable = true)
 |    |    |    |-- productId: string (nullable = true)
 |    |    |    |-- productType: string (nullable = true)
 |    |    |    |-- serialNumber: string (nullable = true)
 |    |    |    |-- smartLicenseProductInstanceIdentifier: string (nullable = true)
 |    |    |    |-- smartLicenseVirtualAccountName: string (nullable = true)
 |    |    |    |-- softwareType: string (nullable = true)
 |    |    |    |-- softwareVersion: string (nullable = true)
 |    |    |    |-- systemUptime: long (nullable = true)
 |    |    |    |-- udiProductIdentifier: string (nullable = true)
 |-- Versions: struct (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- lastUpdated: long (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- version: string (nullable = true)
 |-- collectorId: string (nullable = true)
 |-- generatedAt: long (nullable = true)
 |-- managedNeId: string (nullable = true)
 |-- partyId: string (nullable = true)
 |-- recordType: string (nullable = true)
 |-- sourceNeId: string (nullable = true)
 |-- sourcePartyId: string (nullable = true)
 |-- sourceSubPartyId: string (nullable = true)
 |-- wfid: string (nullable = true)

#####################################
root
 |-- collectorId: string (nullable = true)
 |-- generatedAt: long (nullable = true)
 |-- managedNeId: string (nullable = true)
 |-- neAlert: struct (nullable = true)
 |    |-- advisory: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- equipmentType: string (nullable = true)
 |    |    |    |-- headlineName: string (nullable = true)
 |    |    |    |-- hwId: string (nullable = true)
 |    |    |    |-- neId: string (nullable = true)
 |    |    |    |-- productFamily: string (nullable = true)
 |    |    |    |-- productId: string (nullable = true)
 |    |    |    |-- psirtId: long (nullable = true)
 |    |    |    |-- publicReleaseInd: string (nullable = true)
 |    |    |    |-- softwareType: string (nullable = true)
 |    |    |    |-- softwareVersion: string (nullable = true)
 |    |    |    |-- vulnerabilityReason: string (nullable = true)
 |    |    |    |-- vulnerabilityStatus: string (nullable = true)
 |    |-- fieldNotice: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- caveat: string (nullable = true)
 |    |    |    |-- distributionCode: string (nullable = true)
 |    |    |    |-- equipmentType: string (nullable = true)
 |    |    |    |-- fieldNoticeId: long (nullable = true)
 |    |    |    |-- fieldNoticeName: string (nullable = true)
 |    |    |    |-- hwId: string (nullable = true)
 |    |    |    |-- neId: string (nullable = true)
 |    |    |    |-- productFamily: string (nullable = true)
 |    |    |    |-- productId: string (nullable = true)
 |    |    |    |-- serialNumber: string (nullable = true)
 |    |    |    |-- softwareType: string (nullable = true)
 |    |    |    |-- vulnerabilityReason: string (nullable = true)
 |    |    |    |-- vulnerabilityStatus: string (nullable = true)
 |    |-- hwEoX: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- bulletinName: string (nullable = true)
 |    |    |    |-- equipmentType: string (nullable = true)
 |    |    |    |-- hardwareEoXId: long (nullable = true)
 |    |    |    |-- hwId: string (nullable = true)
 |    |    |    |-- neId: string (nullable = true)
 |    |    |    |-- productId: string (nullable = true)
 |    |-- swEoX: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- bulletinHeadline: string (nullable = true)
 |    |    |    |-- equipmentType: string (nullable = true)
 |    |    |    |-- neId: string (nullable = true)
 |    |    |    |-- productId: string (nullable = true)
 |    |    |    |-- softwareEoXId: long (nullable = true)
 |    |    |    |-- softwareType: string (nullable = true)
 |    |    |    |-- softwareVersion: string (nullable = true)
 |-- partyId: string (nullable = true)
 |-- recordType: string (nullable = true)
 |-- sourceNeId: string (nullable = true)
 |-- sourcePartyId: string (nullable = true)
 |-- sourceSubPartyId: string (nullable = true)
 |-- wfid: string (nullable = true)

如果想在聯接的同時把其中一個數據集的字段保留為嵌套結構,可以先用struct函數創建一個StructType列,再按以下方式聯接:

// Static imports so that col(...) and struct(...) can be called directly from Java.
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.struct;

// Inventory is read unchanged; its columns stay at the top level of the joined result.
Dataset<Row> inventory = spark.read().option("multiLine", true).option("mode", "PERMISSIVE")
        .json("path to json inventory");

// Keep the join key (partyId) at the top level and pack the alert columns into a
// single nested struct column named "ALERTS".
// Replace "columns" with the comma-separated names of all the columns that should
// appear inside the nested field, e.g. struct("neAlert", "generatedAt").
Dataset<Row> alerts = spark.read().option("multiLine", true).option("mode", "PERMISSIVE")
        .json("path to alerts json")
        .select(col("partyId"), struct("columns").as("ALERTS"));

// Join on the shared partyId key. A join with no key or condition would pair every
// inventory row with every alerts row (Cartesian product), or be rejected by Spark
// when cross joins are disabled.
Dataset<Row> inventoryAlerts = inventory.join(alerts, "partyId");
inventoryAlerts.printSchema();

聯接之後,您應該就能得到所需的schema。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM