简体   繁体   中英

Scala: Convert xml dataframe to csv file

Using Scala and IntelliJ,

I have an XML file and I have read it into a dataframe as shown below:

var dftest = spark.read.format("com.databricks.spark.xml").option("rowTag","transferBatch").load(file)

The schema is long and has many sequence element nodes. Some of the columns also have different data types.

root
 |-- accountingInfo: struct (nullable = true)
 |    |-- currencyConversion: struct (nullable = true)
 |    |    |-- ExchangeRateDefinition: struct (nullable = true)
 |    |    |    |-- exchangeRate: long (nullable = true)
 |    |    |    |-- exchangeRateCode: long (nullable = true)
 |    |    |    |-- numberOfDecimalPlaces: long (nullable = true)
 |    |-- localCurrency: string (nullable = true)
 |    |-- tapDecimalPlaces: long (nullable = true)
 |-- auditControlInfo: struct (nullable = true)
 |    |-- callEventDetailsCount: long (nullable = true)
 |    |-- earliestCallTimeStamp: struct (nullable = true)
 |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |-- utcTimeOffset: string (nullable = true)
 |    |-- latestCallTimeStamp: struct (nullable = true)
 |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |-- utcTimeOffset: string (nullable = true)
 |    |-- operatorSpecInformation: struct (nullable = true)
 |    |    |-- OperatorSpecInformation: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |-- totalChargeValueList: struct (nullable = true)
 |    |    |-- TotalChargeValue: struct (nullable = true)
 |    |    |    |-- chargeType: string (nullable = true)
 |    |    |    |-- totalCharge: long (nullable = true)
 |    |-- totalDiscountValue: long (nullable = true)
 |    |-- totalTaxValue: long (nullable = true)
 |-- batchControlInfo: struct (nullable = true)
 |    |-- fileAvailableTimeStamp: struct (nullable = true)
 |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |-- utcTimeOffset: string (nullable = true)
 |    |-- fileCreationTimeStamp: struct (nullable = true)
 |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |-- utcTimeOffset: string (nullable = true)
 |    |-- fileSequenceNumber: string (nullable = true)
 |    |-- recipient: string (nullable = true)
 |    |-- releaseVersionNumber: long (nullable = true)
 |    |-- sender: string (nullable = true)
 |    |-- specificationVersionNumber: long (nullable = true)
 |    |-- transferCutOffTimeStamp: struct (nullable = true)
 |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |-- utcTimeOffset: string (nullable = true)
 |-- callEventDetails: struct (nullable = true)
 |    |-- gprsCall: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- equipmentInformation: struct (nullable = true)
 |    |    |    |    |-- imeiOrEsn: struct (nullable = true)
 |    |    |    |    |    |-- imei: string (nullable = true)
 |    |    |    |-- gprsBasicCallInformation: struct (nullable = true)
 |    |    |    |    |-- callEventStartTimeStamp: struct (nullable = true)
 |    |    |    |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |    |    |    |-- utcTimeOffsetCode: long (nullable = true)
 |    |    |    |    |-- chargeableSubscriber: struct (nullable = true)
 |    |    |    |    |    |-- chargeableSubscriber: struct (nullable = true)
 |    |    |    |    |    |    |-- simChargeableSubscriber: struct (nullable = true)
 |    |    |    |    |    |    |    |-- imsi: string (nullable = true)
 |    |    |    |    |    |    |    |-- msisdn: string (nullable = true)
 |    |    |    |    |    |-- pdpAddress: string (nullable = true)
 |    |    |    |    |    |-- pdpType: long (nullable = true)
 |    |    |    |    |-- chargingId: string (nullable = true)
 |    |    |    |    |-- gprsDestination: struct (nullable = true)
 |    |    |    |    |    |-- accessPointNameNI: string (nullable = true)
 |    |    |    |    |    |-- accessPointNameOI: string (nullable = true)
 |    |    |    |    |-- totalCallEventDuration: long (nullable = true)
 |    |    |    |-- gprsLocationInformation: struct (nullable = true)
 |    |    |    |    |-- gprsNetworkLocation: struct (nullable = true)
 |    |    |    |    |    |-- cellId: long (nullable = true)
 |    |    |    |    |    |-- locationArea: long (nullable = true)
 |    |    |    |    |    |-- recEntity: struct (nullable = true)
 |    |    |    |    |    |    |-- RecEntityCode: array (nullable = true)
 |    |    |    |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- gprsServiceUsed: struct (nullable = true)
 |    |    |    |    |-- chargeInformationList: struct (nullable = true)
 |    |    |    |    |    |-- ChargeInformation: struct (nullable = true)
 |    |    |    |    |    |    |-- chargeDetailList: struct (nullable = true)
 |    |    |    |    |    |    |    |-- ChargeDetail: struct (nullable = true)
 |    |    |    |    |    |    |    |    |-- charge: long (nullable = true)
 |    |    |    |    |    |    |    |    |-- chargeType: string (nullable = true)
 |    |    |    |    |    |    |    |    |-- chargeableUnits: long (nullable = true)
 |    |    |    |    |    |    |    |    |-- chargedUnits: long (nullable = true)
 |    |    |    |    |    |    |    |    |-- dayCategory: long (nullable = true)
 |    |    |    |    |    |    |    |    |-- timeBand: long (nullable = true)
 |    |    |    |    |    |    |-- chargedItem: long (nullable = true)
 |    |    |    |    |    |    |-- exchangeRateCode: long (nullable = true)
 |    |    |    |    |-- gprsServiceUsageList: struct (nullable = true)
 |    |    |    |    |    |-- GprsServiceUsage: struct (nullable = true)
 |    |    |    |    |    |    |-- dataVolumeIncoming: long (nullable = true)
 |    |    |    |    |    |    |-- dataVolumeOutgoing: long (nullable = true)
 |    |    |    |-- operatorSpecInformation: struct (nullable = true)
 |    |    |    |    |-- OperatorSpecInformation: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- typeOfControllingNode: long (nullable = true)
 |    |-- mobileOriginatedCall: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- basicCallInformation: struct (nullable = true)
 |    |    |    |    |-- callEventStartTimeStamp: struct (nullable = true)
 |    |    |    |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |    |    |    |-- utcTimeOffsetCode: long (nullable = true)
 |    |    |    |    |-- chargeableSubscriber: struct (nullable = true)
 |    |    |    |    |    |-- simChargeableSubscriber: struct (nullable = true)
 |    |    |    |    |    |    |-- imsi: string (nullable = true)
 |    |    |    |    |    |    |-- msisdn: string (nullable = true)
 |    |    |    |    |-- destination: struct (nullable = true)
 |    |    |    |    |    |-- calledNumber: string (nullable = true)
 |    |    |    |    |-- totalCallEventDuration: long (nullable = true)
 |    |    |    |-- basicServiceUsedList: struct (nullable = true)
 |    |    |    |    |-- BasicServiceUsed: struct (nullable = true)
 |    |    |    |    |    |-- basicService: struct (nullable = true)
 |    |    |    |    |    |    |-- serviceCode: struct (nullable = true)
 |    |    |    |    |    |    |    |-- teleServiceCode: string (nullable = true)
 |    |    |    |    |    |-- chargeInformationList: struct (nullable = true)
 |    |    |    |    |    |    |-- ChargeInformation: struct (nullable = true)
 |    |    |    |    |    |    |    |-- callTypeGroup: struct (nullable = true)
 |    |    |    |    |    |    |    |    |-- callTypeLevel1: long (nullable = true)
 |    |    |    |    |    |    |    |    |-- callTypeLevel2: long (nullable = true)
 |    |    |    |    |    |    |    |    |-- callTypeLevel3: long (nullable = true)
 |    |    |    |    |    |    |    |    |-- calledCountryCode: string (nullable = true)
 |    |    |    |    |    |    |    |-- chargeDetailList: struct (nullable = true)
 |    |    |    |    |    |    |    |    |-- ChargeDetail: struct (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- charge: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- chargeType: string (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- chargeableUnits: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- chargedUnits: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- dayCategory: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- timeBand: long (nullable = true)
 |    |    |    |    |    |    |    |-- chargedItem: long (nullable = true)
 |    |    |    |    |    |    |    |-- exchangeRateCode: long (nullable = true)
 |    |    |    |-- equipmentInformation: struct (nullable = true)
 |    |    |    |    |-- imeiOrEsn: struct (nullable = true)
 |    |    |    |    |    |-- imei: string (nullable = true)
 |    |    |    |-- locationInformation: struct (nullable = true)
 |    |    |    |    |-- networkLocation: struct (nullable = true)
 |    |    |    |    |    |-- callReference: string (nullable = true)
 |    |    |    |    |    |-- cellId: long (nullable = true)
 |    |    |    |    |    |-- locationArea: long (nullable = true)
 |    |    |    |    |    |-- recEntityCode: long (nullable = true)
 |    |    |    |-- operatorSpecInformation: struct (nullable = true)
 |    |    |    |    |-- OperatorSpecInformation: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |-- mobileTerminatedCall: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- basicCallInformation: struct (nullable = true)
 |    |    |    |    |-- callEventStartTimeStamp: struct (nullable = true)
 |    |    |    |    |    |-- localTimeStamp: string (nullable = true)
 |    |    |    |    |    |-- utcTimeOffsetCode: long (nullable = true)
 |    |    |    |    |-- callOriginator: struct (nullable = true)
 |    |    |    |    |    |-- callingNumber: string (nullable = true)
 |    |    |    |    |-- chargeableSubscriber: struct (nullable = true)
 |    |    |    |    |    |-- simChargeableSubscriber: struct (nullable = true)
 |    |    |    |    |    |    |-- imsi: string (nullable = true)
 |    |    |    |    |    |    |-- msisdn: string (nullable = true)
 |    |    |    |    |-- totalCallEventDuration: long (nullable = true)
 |    |    |    |-- basicServiceUsedList: struct (nullable = true)
 |    |    |    |    |-- BasicServiceUsed: struct (nullable = true)
 |    |    |    |    |    |-- basicService: struct (nullable = true)
 |    |    |    |    |    |    |-- serviceCode: struct (nullable = true)
 |    |    |    |    |    |    |    |-- teleServiceCode: string (nullable = true)
 |    |    |    |    |    |-- chargeInformationList: struct (nullable = true)
 |    |    |    |    |    |    |-- ChargeInformation: struct (nullable = true)
 |    |    |    |    |    |    |    |-- chargeDetailList: struct (nullable = true)
 |    |    |    |    |    |    |    |    |-- ChargeDetail: struct (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- charge: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- chargeType: string (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- chargeableUnits: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- chargedUnits: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- dayCategory: long (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- timeBand: long (nullable = true)
 |    |    |    |    |    |    |    |-- chargedItem: long (nullable = true)
 |    |    |    |    |    |    |    |-- exchangeRateCode: long (nullable = true)
 |    |    |    |-- equipmentInformation: struct (nullable = true)
 |    |    |    |    |-- imeiOrEsn: struct (nullable = true)
 |    |    |    |    |    |-- imei: string (nullable = true)
 |    |    |    |-- locationInformation: struct (nullable = true)
 |    |    |    |    |-- networkLocation: struct (nullable = true)
 |    |    |    |    |    |-- callReference: string (nullable = true)
 |    |    |    |    |    |-- cellId: long (nullable = true)
 |    |    |    |    |    |-- locationArea: long (nullable = true)
 |    |    |    |    |    |-- recEntityCode: long (nullable = true)
 |    |    |    |-- operatorSpecInformation: struct (nullable = true)
 |    |    |    |    |-- OperatorSpecInformation: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |-- networkInfo: struct (nullable = true)
 |    |-- calledNumAnalysis: struct (nullable = true)
 |    |    |-- CalledNumAnalysis: struct (nullable = true)
 |    |    |    |-- calledNumAnalysisCode: long (nullable = true)
 |    |    |    |-- countryCodeTable: struct (nullable = true)
 |    |    |    |    |-- CountryCode: string (nullable = true)
 |    |    |    |-- iacTable: struct (nullable = true)
 |    |    |    |    |-- Iac: string (nullable = true)
 |    |-- networkType: long (nullable = true)
 |    |-- recEntityInfo: struct (nullable = true)
 |    |    |-- RecEntityDefinition: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- recEntityCode: long (nullable = true)
 |    |    |    |    |-- recEntityId: struct (nullable = true)
 |    |    |    |    |    |-- gsnaddress: struct (nullable = true)
 |    |    |    |    |    |    |-- iPTextV4Address: string (nullable = true)
 |    |    |    |    |    |-- mscId: string (nullable = true)
 |    |    |    |    |    |-- msisdn: string (nullable = true)
 |    |    |    |    |-- recEntityType: long (nullable = true)
 |    |-- utcTimeOffsetInfo: struct (nullable = true)
 |    |    |-- UtcTimeOffsetDefinition: struct (nullable = true)
 |    |    |    |-- utcTimeOffset: string (nullable = true)
 |    |    |    |-- utcTimeOffsetCode: long (nullable = true)

When I display the contents of the dataframe, they are shown as a table (the original post included a screenshot of the table here).

I'm unsure how to write this dataframe to a CSV file.

Any advice? Thanks

Please take a look at the spark-csv library from Databricks:

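Here is a simple example. Note that CSV is a flat format and the CSV writer does not accept struct or array columns, so a nested dataframe like the one above must first be flattened (for example, by selecting the nested fields as top-level columns and exploding the arrays) before it can be written: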

mydf.write.
    format("com.databricks.spark.csv").
    option("header", "true").
    save("out.csv")

You can find it here: https://mvnrepository.com/artifact/com.databricks/spark-csv_2.10

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM