[英]scala.MatchError during Spark 2.0.2 DataFrame union
我正在尝试使用union函数合并2个DataFrame,一个包含旧数据,另一个包含新数据。 这曾经工作,直到我尝试动态地向旧DataFrame添加新字段,因为我的架构正在发展。
这意味着我的旧数据将丢失一个字段,新数据将拥有它。 为了使联合起作用,我将使用下面的evolveSchema函数添加该字段。
这导致输出/异常我粘贴在代码下面,包括我的调试打印。
列排序和制作字段可以通过使DataFrame尽可能相同来尝试解决此问题,但它仍然存在。 模式打印显示在这些操作之后它们看起来都是相同的。
任何帮助进一步调试这将是值得赞赏的。
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, SQLContext}
object Merger {
def apply(sqlContext: SQLContext, oldDataSet: Option[DataFrame], newEnrichments: Option[DataFrame]): Option[DataFrame] = {
(oldDataSet, newEnrichments) match {
case (None, None) => None
case (None, _) => newEnrichments
case (Some(existing), None) => Some(existing)
case (Some(existing), Some(news)) => Some {
val evolvedOldDataSet = evolveSchema(existing)
println("EVOLVED OLD SCHEMA FIELD NAMES:" + evolvedOldDataSet.schema.fieldNames.mkString(","))
println("NEW SCHEMA FIELD NAMES:" + news.schema.fieldNames.mkString(","))
println("EVOLVED OLD SCHEMA FIELD TYPES:" + evolvedOldDataSet.schema.fields.map(_.dataType).mkString(","))
println("NEW SCHEMA FIELD TYPES:" + news.schema.fields.map(_.dataType).mkString(","))
println("OLD SCHEMA")
existing.printSchema();
println("PRINT EVOLVED OLD SCHEMA")
evolvedOldDataSet.printSchema()
println("PRINT NEW SCHEMA")
news.printSchema()
val nullableEvolvedOldDataSet = setNullableTrue(evolvedOldDataSet)
val nullableNews = setNullableTrue(news)
println("NULLABLE EVOLVED OLD")
nullableEvolvedOldDataSet.printSchema()
println("NULLABLE NEW")
nullableNews.printSchema()
val unionData =nullableEvolvedOldDataSet.union(nullableNews)
val result = unionData.sort(
unionData("timestamp").desc
).dropDuplicates(
Seq("id")
)
result.cache()
}
}
}
def GENRE_FIELD : String = "station_genre"
// Handle missing fields in old data
def evolveSchema(oldDataSet: DataFrame): DataFrame = {
if (!oldDataSet.schema.fieldNames.contains(GENRE_FIELD)) {
val columnAdded = oldDataSet.withColumn(GENRE_FIELD, lit("N/A"))
// Columns should be in the same order for union
val columnNamesInOrder = Seq("id", "station_id", "station_name", "station_timezone", "station_genre", "publisher_id", "publisher_name", "group_id", "group_name", "timestamp")
val reorderedColumns = columnAdded.select(columnNamesInOrder.head, columnNamesInOrder.tail: _*)
reorderedColumns
}
else
oldDataSet
}
def setNullableTrue(df: DataFrame) : DataFrame = {
// get schema
val schema = df.schema
// create new schema with all fields nullable
val newSchema = StructType(schema.map {
case StructField(columnName, dataType, _, metaData) => StructField( columnName, dataType, nullable = true, metaData)
})
// apply new schema
df.sqlContext.createDataFrame( df.rdd, newSchema )
}
}
EVOLVED OLD SCHEMA字段名称:id,station_id,station_name,station_timezone,station_genre,publisher_id,publisher_name,group_id,group_name,timestamp
新SCHEMA字段名称:id,station_id,station_name,station_timezone,station_genre,publisher_id,publisher_name,group_id,group_name,timestamp
EVOLVED OLD SCHEMA字段类型:StringType,LongType,StringType,StringType,StringType,LongType,StringType,LongType,StringType,LongType
NEW SCHEMA FIELD TYPES:StringType,LongType,StringType,StringType,StringType,LongType,StringType,LongType,StringType,LongType
OLD SCHEMA root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(nullable = true)| - station_timezone:string(nullable = true)| - publisher_id:long(nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(nullable = true)| - timestamp:long(nullable =真正)
PRINT EVOLVED OLD SCHEMA root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(nullable = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = false)| - publisher_id:long(nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string( nullable = true)| - timestamp:long(nullable = true)
PRINT NEW SCHEMA root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(nullable = true)| - station_timezone:string(nullable = true)| - - station_genre:string(nullable = true)| - publisher_id:long(nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(可空) = true)| - timestamp:long(nullable = true)
NULLABLE EVOLVED OLD root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(nullable = true)| - station_timezone:string(nullable = true)| - - station_genre:string(nullable = true)| - publisher_id:long(nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(可空) = true)| - timestamp:long(nullable = true)
NULLABLE NEW root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(nullable = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = true)| - publisher_id:long(nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(nullable = true)| - timestamp:long(nullable = true)
2017-01-18 15:59:32错误org.apache.spark.internal.Logging $ class Executor:91 - 阶段2.0(TID 4)任务1.0中的异常scala.MatchError:false(类java.lang.Boolean) )org.apache.spark.sql.catalyst.CatalystTypeConverters $ StringConverter $ .toCatalystImpl(CatalystTypeConverters.scala:296)at at
...
com.companystuff.meta.uploader.Merger $。适用(Merger.scala:49)
...
引发者:scala.MatchError:false(类java.lang.Boolean)at org.apache.spark.sql.catalyst.CatalystTypeConverters $ StringConverter $ .toCatalystImpl(CatalystTypeConverters.scala:296)...
这是因为即使其架构相同,也会对实际数据进行排序。 因此,只需选择所有必需的列,然后执行联合查询。
像这样的东西:
val columns:Seq[String]= ....
val df = oldDf.select(columns:_*).union(newDf.select(columns:_*)
希望它能帮到你
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.