![](/img/trans.png)
[英]How to convert DataFrame to Dataset in Apache Spark in Java?
[英]How to add JSON Object to a Dataset/Dataframe in apache spark
我想知道是否可以使用Spark Dataset API或Apache Spark提供的任何其他功能來創建自定義JSON。
我知道可以使用join()方法聯接兩個數據集,但是我想創建的是自定義JSON:
把數據集2(在我的情況下是警報)作為鍵為“ALERT”的JSON對象,添加到數據集1(即庫存)之中。
// Locations of the two JSON inputs (Windows paths, backslashes escaped).
String inventoryPath = "C:\\Users\\phyadavi\\LearningAndDevelopment\\\\CDXJSONMergeJob\\data1\\inventory.json";
String alertPath = "C:\\Users\\phyadavi\\LearningAndDevelopment\\\\CDXJSONMergeJob\\data1\\alert.json";

// Read the inventory file with the multiLine option enabled and PERMISSIVE parse mode.
Dataset<Row> inventory = spark.read()
        .option("multiLine", true)
        .option("mode", "PERMISSIVE")
        .json(inventoryPath);

// Read the alert file with the same reader options.
Dataset<Row> alerts = spark.read()
        .option("multiLine", true)
        .option("mode", "PERMISSIVE")
        .json(alertPath);

// Join the two datasets (no join condition is given here) and print the resulting schema.
Dataset<Row> inventoryAlerts = inventory.join(alerts);
inventoryAlerts.printSchema();
庫存和警報的架構如下。
root
|-- Equipment: struct (nullable = true)
| |-- items: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- collectedPid: string (nullable = true)
| | | |-- collectedSerialNum: string (nullable = true)
| | | |-- containingHwId: string (nullable = true)
| | | |-- equipmentType: string (nullable = true)
| | | |-- hwId: string (nullable = true)
| | | |-- items: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- tagName: string (nullable = true)
| | | | | |-- tagValue: string (nullable = true)
| | | |-- pceMultiPid: string (nullable = true)
| | | |-- pcePhyiscalType: string (nullable = true)
| | | |-- pcePid: string (nullable = true)
| | | |-- pceProductDescription: string (nullable = true)
| | | |-- pceProductFamily: string (nullable = true)
| | | |-- pceProductType: string (nullable = true)
| | | |-- pceRuleId: string (nullable = true)
| | | |-- productDescription: string (nullable = true)
| | | |-- productFamily: string (nullable = true)
| | | |-- productId: string (nullable = true)
| | | |-- productType: string (nullable = true)
| | | |-- serialNumber: string (nullable = true)
| | | |-- snasItemType: string (nullable = true)
| | | |-- snasProductFamily: string (nullable = true)
| | | |-- snasSerialNumber: string (nullable = true)
| | | |-- snasValidationCode: string (nullable = true)
| | | |-- snasValidationSource: string (nullable = true)
|-- LicenseActivated: struct (nullable = true)
| |-- items: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- count: long (nullable = true)
| | | |-- type: string (nullable = true)
|-- NetworkElement: struct (nullable = true)
| |-- items: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- hostname: string (nullable = true)
| | | |-- ipAddress: string (nullable = true)
| | | |-- isManagedNe: boolean (nullable = true)
| | | |-- items: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- tagName: string (nullable = true)
| | | | | |-- tagValue: string (nullable = true)
| | | |-- lastUpdateDate: long (nullable = true)
| | | |-- managedNeId: string (nullable = true)
| | | |-- managementAddress: string (nullable = true)
| | | |-- neId: string (nullable = true)
| | | |-- neName: string (nullable = true)
| | | |-- neRegistrationStatus: string (nullable = true)
| | | |-- productFamily: string (nullable = true)
| | | |-- productId: string (nullable = true)
| | | |-- productType: string (nullable = true)
| | | |-- serialNumber: string (nullable = true)
| | | |-- smartLicenseProductInstanceIdentifier: string (nullable = true)
| | | |-- smartLicenseVirtualAccountName: string (nullable = true)
| | | |-- softwareType: string (nullable = true)
| | | |-- softwareVersion: string (nullable = true)
| | | |-- systemUptime: long (nullable = true)
| | | |-- udiProductIdentifier: string (nullable = true)
|-- Versions: struct (nullable = true)
| |-- items: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- lastUpdated: long (nullable = true)
| | | |-- type: string (nullable = true)
| | | |-- version: string (nullable = true)
|-- collectorId: string (nullable = true)
|-- generatedAt: long (nullable = true)
|-- managedNeId: string (nullable = true)
|-- partyId: string (nullable = true)
|-- recordType: string (nullable = true)
|-- sourceNeId: string (nullable = true)
|-- sourcePartyId: string (nullable = true)
|-- sourceSubPartyId: string (nullable = true)
|-- wfid: string (nullable = true)
#####################################
root
|-- collectorId: string (nullable = true)
|-- generatedAt: long (nullable = true)
|-- managedNeId: string (nullable = true)
|-- neAlert: struct (nullable = true)
| |-- advisory: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- equipmentType: string (nullable = true)
| | | |-- headlineName: string (nullable = true)
| | | |-- hwId: string (nullable = true)
| | | |-- neId: string (nullable = true)
| | | |-- productFamily: string (nullable = true)
| | | |-- productId: string (nullable = true)
| | | |-- psirtId: long (nullable = true)
| | | |-- publicReleaseInd: string (nullable = true)
| | | |-- softwareType: string (nullable = true)
| | | |-- softwareVersion: string (nullable = true)
| | | |-- vulnerabilityReason: string (nullable = true)
| | | |-- vulnerabilityStatus: string (nullable = true)
| |-- fieldNotice: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- caveat: string (nullable = true)
| | | |-- distributionCode: string (nullable = true)
| | | |-- equipmentType: string (nullable = true)
| | | |-- fieldNoticeId: long (nullable = true)
| | | |-- fieldNoticeName: string (nullable = true)
| | | |-- hwId: string (nullable = true)
| | | |-- neId: string (nullable = true)
| | | |-- productFamily: string (nullable = true)
| | | |-- productId: string (nullable = true)
| | | |-- serialNumber: string (nullable = true)
| | | |-- softwareType: string (nullable = true)
| | | |-- vulnerabilityReason: string (nullable = true)
| | | |-- vulnerabilityStatus: string (nullable = true)
| |-- hwEoX: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- bulletinName: string (nullable = true)
| | | |-- equipmentType: string (nullable = true)
| | | |-- hardwareEoXId: long (nullable = true)
| | | |-- hwId: string (nullable = true)
| | | |-- neId: string (nullable = true)
| | | |-- productId: string (nullable = true)
| |-- swEoX: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- bulletinHeadline: string (nullable = true)
| | | |-- equipmentType: string (nullable = true)
| | | |-- neId: string (nullable = true)
| | | |-- productId: string (nullable = true)
| | | |-- softwareEoXId: long (nullable = true)
| | | |-- softwareType: string (nullable = true)
| | | |-- softwareVersion: string (nullable = true)
|-- partyId: string (nullable = true)
|-- recordType: string (nullable = true)
|-- sourceNeId: string (nullable = true)
|-- sourcePartyId: string (nullable = true)
|-- sourceSubPartyId: string (nullable = true)
|-- wfid: string (nullable = true)
如果要聯接兩個數據集,並將其中一個數據集的字段保留為嵌套結構,
則可以使用struct創建一個StructType列,然後按以下方式聯接:
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.struct;

// Inventory is read as-is; its columns stay at the top level of the joined result.
Dataset<Row> inventory = spark.read().option("multiLine", true).option("mode", "PERMISSIVE")
        .json("path to json inventory");

// Keep the join key (partyId) at the top level and wrap the alert payload in a single
// struct column named "ALERTS". struct(...) accepts any number of column names, so list
// every alerts column that should end up nested (here: neAlert).
Dataset<Row> alerts = spark.read().option("multiLine", true).option("mode", "PERMISSIVE")
        .json("path to alerts json")
        .select(col("partyId"), struct("neAlert").as("ALERTS"));

// Join on the shared partyId column. A bare join(alerts) carries no join predicate,
// so inventory rows would not be matched to their own alerts.
Dataset<Row> inventoryAlerts = inventory.join(alerts, "partyId");
inventoryAlerts.printSchema();
完成join之後,這應該會為您提供所需的schema。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.