I have multiple data types within a defined schema. I'm trying to find a good way to filter by TimestampType so that I can transform all TimestampType fields from long to datetime. I'm able to filter using `.dtypes` within the stream on StringType, but I run into issues when trying to filter with `.dtypes` on StructFields and StructTypes. Is there a way to filter only the TimestampType fields within a struct? Below is the pseudocode I'm using, which is in Scala 2.11 using Spark Structured Streaming.
// Date pattern used further down when formatting timestamp columns as strings
val isoDateFormatter = "yyyy-MM-dd'T'HH:mm:ss'Z'"
// Streaming read from Kafka: records are merged per key, then the merged value is
// flattened into top-level columns.
val ExampleDataFrameLoad = spark
.readStream
.format("kafka")
// Subscribe to every topic named by the keys of `topics`, as a comma-separated list
.option("subscribe", topics.keys.mkString(","))
.options(kafkaConfig)
.load()
// Keep key and value cast to strings, plus the source topic
.select($"key".cast(StringType), $"value".cast(StringType), $"topic")
// Convert untyped dataframe to dataset
.as[(String, String, String)]
// Merge all manifests for vehicle in minibatch
// (grouping is on the first tuple element, i.e. the key)
.groupByKey(_._1)
//Start of merge
// mergeGroup is defined elsewhere; it runs in Append mode with a processing-time timeout
.flatMapGroupsWithState(OutputMode.Append, GroupStateTimeout.ProcessingTimeTimeout)(mergeGroup)
// .select($"key".cast(StringType),from_json($"value",schema).as("manifest"))
// Name the two elements of the merged pair, then expand the fields of `jsonvalues`
// into top-level columns next to `key`
.select($"_1".alias("key"), $"_2".alias("jsonvalues"))
.select("key", "jsonvalues.*")
val ExampleDataFrame = ExampleDataFrameLoad
// Print each (column name, type string) pair of the resulting schema
ExampleDataFrame.dtypes.foreach(println)
/* Returns
(key,StringType)
(contractVersion,StringType)
(metaData,StructType(StructField(Test,StringType,true), StructField(DateUtc,TimestampType,true)
*/
/* Uses the following objects
import java.sql.Timestamp
object ManifestClasses {
  final case class ProductManifestDocument(
    contractVersion: Option[String],
    metaData: DocumentMetaData
  )
  final case class DocumentMetaData(
    Test: Option[String],
    DateUtc: Timestamp
  )
}
*/
// Attempt to reformat every timestamp column by scanning the top-level column types.
ExampleDataFrame
//brings back data fields with types
.dtypes
//Currently returning empty but works for StringType
// (in the dtypes output above, DateUtc is nested inside metaData, whose type string is
// "StructType(...)", so no top-level entry equals "TimestampType")
.filter(_._2 == "TimestampType")
.map(_._1)
//Transforms all timestamp longs to yyyy-MM-dd'T'HH:mm:ss'Z' format
.foldLeft(ExampleDataFrame)((df, colName) => df.withColumn(colName, date_format(col(colName), isoDateFormatter)))
You can convert the fields of the StructType as shown below.
Changing the date format of the TimestampType field in place:
// Build a one-row example DataFrame from inline SQL VALUES: `metaData` is a struct with a
// string field `Test` and a timestamp field `DateUtc` (the current unix time cast to timestamp).
val ExampleDataFrame = spark.sql("select key, contractVersion, metaData from values " +
"('k1', 'v1', named_struct('Test', 'test1', 'DateUtc', cast(unix_timestamp() as timestamp))) " +
"T(key, contractVersion, metaData)")
// Show the row, the schema tree, and the (name, type string) pairs; output is in the comment below
ExampleDataFrame.show(false)
ExampleDataFrame.printSchema()
ExampleDataFrame.dtypes.foreach(println)
/**
* +---+---------------+----------------------------+
* |key|contractVersion|metaData |
* +---+---------------+----------------------------+
* |k1 |v1 |[test1, 2020-06-23 14:39:55]|
* +---+---------------+----------------------------+
*
* root
* |-- key: string (nullable = false)
* |-- contractVersion: string (nullable = false)
* |-- metaData: struct (nullable = false)
* | |-- Test: string (nullable = false)
* | |-- DateUtc: timestamp (nullable = true)
*
* (key,StringType)
* (contractVersion,StringType)
* (metaData,StructType(StructField(Test,StringType,false), StructField(DateUtc,TimestampType,true)))
*/
val isoDateFormatter = "yyyy-MM-dd'T'HH:mm:ss'Z'"
// Rebuild the metaData struct in place, with DateUtc rendered as a formatted string.
// Each member is aliased explicitly: struct() only retains the name of a named column,
// so the unaliased date_format(...) expression would otherwise receive an auto-generated
// field name and the struct would lose its `DateUtc` field.
val processedDF = ExampleDataFrame.withColumn("metaData", struct(
  $"metaData.Test".as("Test"),
  date_format($"metaData.DateUtc", isoDateFormatter).as("DateUtc")))
processedDF.show(false)
/**
* +---+---------------+-----------------------------+
* |key|contractVersion|metaData |
* +---+---------------+-----------------------------+
* |k1 |v1 |[test1, 2020-06-23T14:51:17Z]|
* +---+---------------+-----------------------------+
*/
// Extracting each TimestampType field from a StructType column as a separate column.
// For every top-level struct column, collect the "parent.child" paths of its timestamp
// fields, then add one formatted string column per path. The new column's name is the
// dotted path itself, while the original struct column is left unchanged.
ExampleDataFrame.schema
  .flatMap { parent =>
    parent.dataType match {
      // Only struct columns can contain nested timestamp fields
      case nested: StructType =>
        nested
          .filter(_.dataType == TimestampType)
          .map(child => s"${parent.name}.${child.name}")
      case _ => Seq.empty[String]
    }
  }
  .foldLeft(ExampleDataFrame)((df, colName) => df.withColumn(colName, date_format(col(colName), isoDateFormatter)))
  .show(false)
/**
* +---+---------------+----------------------------+--------------------+
* |key|contractVersion|metaData |metaData.DateUtc |
* +---+---------------+----------------------------+--------------------+
* |k1 |v1 |[test1, 2020-06-23 21:40:36]|2020-06-23T21:40:36Z|
* +---+---------------+----------------------------+--------------------+
*/
// use df.select("`metaData.DateUtc`") to select the columns having dot(.)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.