I am using Scala and IntelliJ.
I have an XML file that I have read into a DataFrame as shown below:
var dftest = spark.read.format("com.databricks.spark.xml").option("rowTag","transferBatch").load(file)
The schema is long and has many sequence element nodes. Some of the columns also have different data types.
root
|-- accountingInfo: struct (nullable = true)
| |-- currencyConversion: struct (nullable = true)
| | |-- ExchangeRateDefinition: struct (nullable = true)
| | | |-- exchangeRate: long (nullable = true)
| | | |-- exchangeRateCode: long (nullable = true)
| | | |-- numberOfDecimalPlaces: long (nullable = true)
| |-- localCurrency: string (nullable = true)
| |-- tapDecimalPlaces: long (nullable = true)
|-- auditControlInfo: struct (nullable = true)
| |-- callEventDetailsCount: long (nullable = true)
| |-- earliestCallTimeStamp: struct (nullable = true)
| | |-- localTimeStamp: string (nullable = true)
| | |-- utcTimeOffset: string (nullable = true)
| |-- latestCallTimeStamp: struct (nullable = true)
| | |-- localTimeStamp: string (nullable = true)
| | |-- utcTimeOffset: string (nullable = true)
| |-- operatorSpecInformation: struct (nullable = true)
| | |-- OperatorSpecInformation: array (nullable = true)
| | | |-- element: string (containsNull = true)
| |-- totalChargeValueList: struct (nullable = true)
| | |-- TotalChargeValue: struct (nullable = true)
| | | |-- chargeType: string (nullable = true)
| | | |-- totalCharge: long (nullable = true)
| |-- totalDiscountValue: long (nullable = true)
| |-- totalTaxValue: long (nullable = true)
|-- batchControlInfo: struct (nullable = true)
| |-- fileAvailableTimeStamp: struct (nullable = true)
| | |-- localTimeStamp: string (nullable = true)
| | |-- utcTimeOffset: string (nullable = true)
| |-- fileCreationTimeStamp: struct (nullable = true)
| | |-- localTimeStamp: string (nullable = true)
| | |-- utcTimeOffset: string (nullable = true)
| |-- fileSequenceNumber: string (nullable = true)
| |-- recipient: string (nullable = true)
| |-- releaseVersionNumber: long (nullable = true)
| |-- sender: string (nullable = true)
| |-- specificationVersionNumber: long (nullable = true)
| |-- transferCutOffTimeStamp: struct (nullable = true)
| | |-- localTimeStamp: string (nullable = true)
| | |-- utcTimeOffset: string (nullable = true)
|-- callEventDetails: struct (nullable = true)
| |-- gprsCall: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- equipmentInformation: struct (nullable = true)
| | | | |-- imeiOrEsn: struct (nullable = true)
| | | | | |-- imei: string (nullable = true)
| | | |-- gprsBasicCallInformation: struct (nullable = true)
| | | | |-- callEventStartTimeStamp: struct (nullable = true)
| | | | | |-- localTimeStamp: string (nullable = true)
| | | | | |-- utcTimeOffsetCode: long (nullable = true)
| | | | |-- chargeableSubscriber: struct (nullable = true)
| | | | | |-- chargeableSubscriber: struct (nullable = true)
| | | | | | |-- simChargeableSubscriber: struct (nullable = true)
| | | | | | | |-- imsi: string (nullable = true)
| | | | | | | |-- msisdn: string (nullable = true)
| | | | | |-- pdpAddress: string (nullable = true)
| | | | | |-- pdpType: long (nullable = true)
| | | | |-- chargingId: string (nullable = true)
| | | | |-- gprsDestination: struct (nullable = true)
| | | | | |-- accessPointNameNI: string (nullable = true)
| | | | | |-- accessPointNameOI: string (nullable = true)
| | | | |-- totalCallEventDuration: long (nullable = true)
| | | |-- gprsLocationInformation: struct (nullable = true)
| | | | |-- gprsNetworkLocation: struct (nullable = true)
| | | | | |-- cellId: long (nullable = true)
| | | | | |-- locationArea: long (nullable = true)
| | | | | |-- recEntity: struct (nullable = true)
| | | | | | |-- RecEntityCode: array (nullable = true)
| | | | | | | |-- element: long (containsNull = true)
| | | |-- gprsServiceUsed: struct (nullable = true)
| | | | |-- chargeInformationList: struct (nullable = true)
| | | | | |-- ChargeInformation: struct (nullable = true)
| | | | | | |-- chargeDetailList: struct (nullable = true)
| | | | | | | |-- ChargeDetail: struct (nullable = true)
| | | | | | | | |-- charge: long (nullable = true)
| | | | | | | | |-- chargeType: string (nullable = true)
| | | | | | | | |-- chargeableUnits: long (nullable = true)
| | | | | | | | |-- chargedUnits: long (nullable = true)
| | | | | | | | |-- dayCategory: long (nullable = true)
| | | | | | | | |-- timeBand: long (nullable = true)
| | | | | | |-- chargedItem: long (nullable = true)
| | | | | | |-- exchangeRateCode: long (nullable = true)
| | | | |-- gprsServiceUsageList: struct (nullable = true)
| | | | | |-- GprsServiceUsage: struct (nullable = true)
| | | | | | |-- dataVolumeIncoming: long (nullable = true)
| | | | | | |-- dataVolumeOutgoing: long (nullable = true)
| | | |-- operatorSpecInformation: struct (nullable = true)
| | | | |-- OperatorSpecInformation: array (nullable = true)
| | | | | |-- element: string (containsNull = true)
| | | |-- typeOfControllingNode: long (nullable = true)
| |-- mobileOriginatedCall: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- basicCallInformation: struct (nullable = true)
| | | | |-- callEventStartTimeStamp: struct (nullable = true)
| | | | | |-- localTimeStamp: string (nullable = true)
| | | | | |-- utcTimeOffsetCode: long (nullable = true)
| | | | |-- chargeableSubscriber: struct (nullable = true)
| | | | | |-- simChargeableSubscriber: struct (nullable = true)
| | | | | | |-- imsi: string (nullable = true)
| | | | | | |-- msisdn: string (nullable = true)
| | | | |-- destination: struct (nullable = true)
| | | | | |-- calledNumber: string (nullable = true)
| | | | |-- totalCallEventDuration: long (nullable = true)
| | | |-- basicServiceUsedList: struct (nullable = true)
| | | | |-- BasicServiceUsed: struct (nullable = true)
| | | | | |-- basicService: struct (nullable = true)
| | | | | | |-- serviceCode: struct (nullable = true)
| | | | | | | |-- teleServiceCode: string (nullable = true)
| | | | | |-- chargeInformationList: struct (nullable = true)
| | | | | | |-- ChargeInformation: struct (nullable = true)
| | | | | | | |-- callTypeGroup: struct (nullable = true)
| | | | | | | | |-- callTypeLevel1: long (nullable = true)
| | | | | | | | |-- callTypeLevel2: long (nullable = true)
| | | | | | | | |-- callTypeLevel3: long (nullable = true)
| | | | | | | | |-- calledCountryCode: string (nullable = true)
| | | | | | | |-- chargeDetailList: struct (nullable = true)
| | | | | | | | |-- ChargeDetail: struct (nullable = true)
| | | | | | | | | |-- charge: long (nullable = true)
| | | | | | | | | |-- chargeType: string (nullable = true)
| | | | | | | | | |-- chargeableUnits: long (nullable = true)
| | | | | | | | | |-- chargedUnits: long (nullable = true)
| | | | | | | | | |-- dayCategory: long (nullable = true)
| | | | | | | | | |-- timeBand: long (nullable = true)
| | | | | | | |-- chargedItem: long (nullable = true)
| | | | | | | |-- exchangeRateCode: long (nullable = true)
| | | |-- equipmentInformation: struct (nullable = true)
| | | | |-- imeiOrEsn: struct (nullable = true)
| | | | | |-- imei: string (nullable = true)
| | | |-- locationInformation: struct (nullable = true)
| | | | |-- networkLocation: struct (nullable = true)
| | | | | |-- callReference: string (nullable = true)
| | | | | |-- cellId: long (nullable = true)
| | | | | |-- locationArea: long (nullable = true)
| | | | | |-- recEntityCode: long (nullable = true)
| | | |-- operatorSpecInformation: struct (nullable = true)
| | | | |-- OperatorSpecInformation: array (nullable = true)
| | | | | |-- element: string (containsNull = true)
| |-- mobileTerminatedCall: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- basicCallInformation: struct (nullable = true)
| | | | |-- callEventStartTimeStamp: struct (nullable = true)
| | | | | |-- localTimeStamp: string (nullable = true)
| | | | | |-- utcTimeOffsetCode: long (nullable = true)
| | | | |-- callOriginator: struct (nullable = true)
| | | | | |-- callingNumber: string (nullable = true)
| | | | |-- chargeableSubscriber: struct (nullable = true)
| | | | | |-- simChargeableSubscriber: struct (nullable = true)
| | | | | | |-- imsi: string (nullable = true)
| | | | | | |-- msisdn: string (nullable = true)
| | | | |-- totalCallEventDuration: long (nullable = true)
| | | |-- basicServiceUsedList: struct (nullable = true)
| | | | |-- BasicServiceUsed: struct (nullable = true)
| | | | | |-- basicService: struct (nullable = true)
| | | | | | |-- serviceCode: struct (nullable = true)
| | | | | | | |-- teleServiceCode: string (nullable = true)
| | | | | |-- chargeInformationList: struct (nullable = true)
| | | | | | |-- ChargeInformation: struct (nullable = true)
| | | | | | | |-- chargeDetailList: struct (nullable = true)
| | | | | | | | |-- ChargeDetail: struct (nullable = true)
| | | | | | | | | |-- charge: long (nullable = true)
| | | | | | | | | |-- chargeType: string (nullable = true)
| | | | | | | | | |-- chargeableUnits: long (nullable = true)
| | | | | | | | | |-- chargedUnits: long (nullable = true)
| | | | | | | | | |-- dayCategory: long (nullable = true)
| | | | | | | | | |-- timeBand: long (nullable = true)
| | | | | | | |-- chargedItem: long (nullable = true)
| | | | | | | |-- exchangeRateCode: long (nullable = true)
| | | |-- equipmentInformation: struct (nullable = true)
| | | | |-- imeiOrEsn: struct (nullable = true)
| | | | | |-- imei: string (nullable = true)
| | | |-- locationInformation: struct (nullable = true)
| | | | |-- networkLocation: struct (nullable = true)
| | | | | |-- callReference: string (nullable = true)
| | | | | |-- cellId: long (nullable = true)
| | | | | |-- locationArea: long (nullable = true)
| | | | | |-- recEntityCode: long (nullable = true)
| | | |-- operatorSpecInformation: struct (nullable = true)
| | | | |-- OperatorSpecInformation: array (nullable = true)
| | | | | |-- element: string (containsNull = true)
|-- networkInfo: struct (nullable = true)
| |-- calledNumAnalysis: struct (nullable = true)
| | |-- CalledNumAnalysis: struct (nullable = true)
| | | |-- calledNumAnalysisCode: long (nullable = true)
| | | |-- countryCodeTable: struct (nullable = true)
| | | | |-- CountryCode: string (nullable = true)
| | | |-- iacTable: struct (nullable = true)
| | | | |-- Iac: string (nullable = true)
| |-- networkType: long (nullable = true)
| |-- recEntityInfo: struct (nullable = true)
| | |-- RecEntityDefinition: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- recEntityCode: long (nullable = true)
| | | | |-- recEntityId: struct (nullable = true)
| | | | | |-- gsnaddress: struct (nullable = true)
| | | | | | |-- iPTextV4Address: string (nullable = true)
| | | | | |-- mscId: string (nullable = true)
| | | | | |-- msisdn: string (nullable = true)
| | | | |-- recEntityType: long (nullable = true)
| |-- utcTimeOffsetInfo: struct (nullable = true)
| | |-- UtcTimeOffsetDefinition: struct (nullable = true)
| | | |-- utcTimeOffset: string (nullable = true)
| | | |-- utcTimeOffsetCode: long (nullable = true)
When I display the contents of the DataFrame, they are shown in a table like this: [image: table]
I'm unsure how I can write this DataFrame to a CSV file.
Any advice? Thanks
Please take a look at the spark-csv library from Databricks.
Here is a simple example:
mydf.write.
format("com.databricks.spark.csv").
option("header", "true").
save("out.csv")
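You can find it here: https://mvnrepository.com/artifact/com.databricks/spark-csv_2.10
Note that CSV is a flat format: nested struct and array columns (such as those in the schema above) cannot be written directly, so first select the nested fields into top-level columns (e.g. select(col("batchControlInfo.sender"))) and explode any arrays before saving.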
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.