
Spark is not able to serialize the task when using a UDF in a filter

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1857)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
    at org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2.apply(DateFormatConstraint.scala:32)
    at org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2.apply(DateFormatConstraint.scala:16)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1$$anonfun$3.apply(Runner.scala:22)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1$$anonfun$3.apply(Runner.scala:22)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1.apply(Runner.scala:22)
    at org.exadatum.ddq.core.Runner$$anonfun$run$1.apply(Runner.scala:20)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at org.exadatum.ddq.core.Runner$.run(Runner.scala:20)
    at org.exadatum.ddq.core.RunCheck.<init>(RunCheck.scala:104)
    at org.exadatum.ddq.core.DQJobTrigger$.main(DQJobTrigger.scala:39)
    at org.exadatum.ddq.core.DQJobTrigger.main(DQJobTrigger.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
    - object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@1d9bd4d6)
    - field (class: org.exadatum.ddq.constraints.DateFormatConstraint, name: sc, type: class org.apache.spark.SparkContext)
    - object (class org.exadatum.ddq.constraints.DateFormatConstraint, DateFormatConstraint(startdate,java.text.SimpleDateFormat@4f76f1a0,org.apache.spark.SparkContext@1d9bd4d6,xdqdemo.customer_details))
    - field (class: org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2, name: $outer, type: class org.exadatum.ddq.constraints.DateFormatConstraint)
    - object (class org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2, )
    - field (class: org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2$$anonfun$3, name: $outer, type: class org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2)
    - object (class org.exadatum.ddq.constraints.DateFormatConstraint$$anonfun$2$$anonfun$3, )
    - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, name: func$2, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2, )
    - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, name: f, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, UDF(startdate#2))
    - writeObject data (class: scala.collection.immutable.$colon$colon)
    - object (class scala.collection.immutable.$colon$colon, List(UDF(startdate#2)))
    - field (class: org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan, name: predicates, type: interface scala.collection.Seq)
    - object (class org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan, InMemoryColumnarTableScan [phone_number#0,name#1,startdate#2], [UDF(startdate#2)], InMemoryRelation [phone_number#0,name#1,startdate#2], true, 10000, StorageLevel(false, true, false, true, 1), ConvertToUnsafe, None)
    - field (class: org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan$$anonfun$doExecute$1, name: $outer, type: class org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan)
    - object (class org.apache.spark.sql.execution.columnar.InMemoryColumnarTableScan$$anonfun$doExecute$1, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, name: f$22, type: interface scala.Function1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, )
    - field (class: org.apache.spark.rdd.MapPartitionsRDD, name: f, type: interface scala.Function3)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[8] at rdd at DateFormatConstraint.scala:32)
    - field (class: org.apache.spark.NarrowDependency, name: _rdd, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.OneToOneDependency, org.apache.spark.OneToOneDependency@316975be)
    - writeObject data (class: scala.collection.immutable.$colon$colon)
    - object (class scala.collection.immutable.$colon$colon, List(org.apache.spark.OneToOneDependency@316975be))
    - field (class: org.apache.spark.rdd.RDD, name: org$apache$spark$rdd$RDD$$dependencies_, type: interface scala.collection.Seq)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[9] at rdd at DateFormatConstraint.scala:32)
    - field (class: org.apache.spark.NarrowDependency, name: _rdd, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.OneToOneDependency, org.apache.spark.OneToOneDependency@526fbb80)
    - writeObject data (class: scala.collection.immutable.$colon$colon)
    - object (class scala.collection.immutable.$colon$colon, List(org.apache.spark.OneToOneDependency@526fbb80))
    - field (class: org.apache.spark.rdd.RDD, name: org$apache$spark$rdd$RDD$$dependencies_, type: interface scala.collection.Seq)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[10] at rdd at DateFormatConstraint.scala:32)
    - field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1, name: $outer, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.rdd.RDD$$anonfun$collect$1, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$collect$1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12, )
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
    ... 39 more

Code snippet:

// This runs inside DateFormatConstraint, so `format`, `sc`, `columnName` and
// `tableName` below are fields of the enclosing constraint instance (the same
// instance the serialization stack above shows being pulled into the closure).
val fun = (df: DataFrame) => {

format.setLenient(false)
val cannotBeDate = udf((column: String) => column != null && Try(format.parse(column)).isFailure)
val maybeCannotBeDateCount = Try(df.filter(cannotBeDate(new Column(columnName))).count)

/** Utility to persist all of the bad records   **/

val hiveContext = new HiveContext(sc)
import hiveContext.implicits._

//Writing all Bad records
//val intermediateYriteToHiveDf = df.filter(cannotBeDate(new Column(columnName)))
val writeToHiveDf = df.filter(cannotBeDate(new Column(columnName)))

val recordLists = new ListBuffer[List[(String, String, String)]]()
writeToHiveDf.rdd.collect().foreach {
  row =>
    val item = row.mkString("-")
    val recordList: List[(String, String, String)] = List((tableName, "ALWAYS_NULL_CONSTRAINT", item))
    recordLists += recordList
}
val listRDD = sc.parallelize(recordLists.flatten)
val dataFrameToHive: DataFrame = listRDD.toDF("table_name", "constraint_applied", "data")
dataFrameToHive.write.mode("append").saveAsTable("xdqdemo.bad_records")



DateFormatConstraintResult(
  this,
  data = maybeCannotBeDateCount.toOption.map(DateFormatConstraintResultData),
  status = ConstraintUtil.tryToStatus[Long](maybeCannotBeDateCount, _ == 0)
)

}
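
Reading the serialization stack from the bottom up: the SparkContext is reached through the sc field of DateFormatConstraint, and that instance is the $outer of the anonymous functions behind the UDF. Because the UDF is defined inside the constraint class and uses its fields, Spark has to serialize the whole instance, sc included. Below is a minimal sketch of the presumed class shape (constructor arguments are read off the trace; the real source will differ):

import java.text.SimpleDateFormat
import scala.util.Try
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.functions.udf

// Hypothetical reconstruction, not the actual DateFormatConstraint source.
case class DateFormatConstraint(columnName: String,
                                format: SimpleDateFormat,
                                sc: SparkContext, // not serializable
                                tableName: String) {

  val fun = (df: DataFrame) => {
    // `format` is a field, so this lambda really closes over `this.format`;
    // serializing the UDF therefore drags in the whole instance, including `sc`,
    // which produces exactly the NotSerializableException above.
    val cannotBeDate = udf((column: String) => column != null && Try(format.parse(column)).isFailure)
    df.filter(cannotBeDate(new Column(columnName))).count
  }
}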

object checkConstraint extends Serializable {
  // The UDF's closure now only captures `format` (a SimpleDateFormat, which is
  // serializable), not the constraint instance that holds the SparkContext.
  // `columnName` is assumed to still be in scope, as in the question.
  def checkDateFormat(format: SimpleDateFormat, df: DataFrame): DataFrame = {
    format.setLenient(false)
    val checkDateFormat = (column: String) => Try(format.parse(column)).isFailure
    val cannotBeDate = udf((column: String) => column != null && checkDateFormat(column))
    df.filter(cannotBeDate(new Column(columnName)))
  }
}


val writeToHiveDf = checkConstraint.checkDateFormat(format,df)

So all of the computation is packed into a singleton object, which returns the required DataFrame.
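
For completeness, here is a sketch of how the original fun body could delegate to this helper, assuming sc, format, columnName, tableName, the HiveContext and the result classes are available exactly as in the question (an illustration of the idea, not the poster's actual code):

val fun = (df: DataFrame) => {
  // The filter/UDF is built inside the serializable checkConstraint object, so the
  // closure shipped to executors no longer references the SparkContext-holding class.
  val writeToHiveDf = checkConstraint.checkDateFormat(format, df)
  val maybeCannotBeDateCount = Try(writeToHiveDf.count)

  val hiveContext = new HiveContext(sc)
  import hiveContext.implicits._

  // As in the question, the offending rows are collected to the driver and
  // re-parallelised, so no executor-side closure needs tableName either.
  val records = writeToHiveDf.rdd.collect().toList.map { row =>
    (tableName, "ALWAYS_NULL_CONSTRAINT", row.mkString("-")) // label kept from the question
  }
  sc.parallelize(records)
    .toDF("table_name", "constraint_applied", "data")
    .write.mode("append").saveAsTable("xdqdemo.bad_records")

  DateFormatConstraintResult(
    this,
    data = maybeCannotBeDateCount.toOption.map(DateFormatConstraintResultData),
    status = ConstraintUtil.tryToStatus[Long](maybeCannotBeDateCount, _ == 0)
  )
}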
