[英]Spark: mapPartitions object not serializable
我对 spark 和 scala 很陌生。 我正在尝试运行一个小程序来更好地理解 mapPartitioins。 但是,当我运行下面的代码时,我收到 class 不可序列化的错误。 我尝试将代码放在配套的 class 中,但这也不起作用。 我还尝试从所有内容中删除私有,但这也不起作用。 我不知道为什么这不起作用。
import java.sql.Date
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import scala.collection.mutable
object DataSetTest {
val spark: SparkSession = createSparkContext()
def main(args: Array[String]): Unit ={
import spark.implicits._
val refDate = new Date(System.currentTimeMillis())
val events : Dataset[Row] = Seq(
("Q", "A", 5, 10),
("N", "A", 8, 11)
).toDF("type", "symbol", "birth", "death")
.repartition(col("refDate"), col("symbol"))
.sort(col("birth"))
events.show()
val matches : Dataset[NbboQuoteMatch] = events.mapPartitions(partition =>
return processTInsidePartition(partition)
)
matches.show()
}
val processTInsidePartition = (partition: Iterator[Row]) => {
import spark.implicits._
val results = new mutable.ArrayBuffer[NbboQuoteMatch]
val rdd: RDD[NbboQuoteMatch] = spark.sparkContext.parallelize(results.toList)
spark.createDataset[NbboQuoteMatch](rdd)
}
case class NbboQuoteMatch (val nbbo: Nbbo, val quote: Quote)
trait Event{
def getBirth(): Long
def getDeath() : Long
def getType() : String
}
case class Nbbo( val symbol: String, val birth: Long, val death: Long) extends Event {
override def getBirth(): Long = birth
override def getDeath(): Long = death
override def getType(): String = "Q"
}
case class Quote ( val symbol: String, val birth: Long, val death: Long) extends Event{
override def getDeath(): Long = death
override def getBirth(): Long = birth
override def getType(): String = "N"
}
def createSparkContext(): SparkSession ={
val rv : SparkSession = SparkSession.builder()
.appName("abc")
.config(new SparkConf())
.enableHiveSupport()
.config("spark.master", "local[*]")
.config("hive.exec.dynamic.partition", "true")
.config("hive.exec.dynamic.partition.mode", "nonstrict")
.config("spark.executor.instances", 8).getOrCreate()
return rv;
}
}
错误
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable: java.io.NotSerializableException: sandbox.etp.DataSetTest$
Serialization stack:
- object not serializable (class: sandbox.etp.DataSetTest$, value: sandbox.etp.DataSetTest$@7d1ccdfe)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 2)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class sandbox.etp.DataSetTest$, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic sandbox/etp/DataSetTest$.$anonfun$main$1:(Lsandbox/etp/DataSetTest$;Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/runtime/Nothing$;, instantiatedMethodType=(Lscala/collection/Iterator;)Lscala/runtime/Nothing$;, numCaptured=2])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class sandbox.etp.DataSetTest$$$Lambda$17/1227659410, sandbox.etp.DataSetTest$$$Lambda$17/1227659410@4d4c4b14)
- field (class: org.apache.spark.sql.execution.MapPartitionsExec, name: func, type: interface scala.Function1)
- object (class org.apache.spark.sql.execution.MapPartitionsExec, MapPartitions sandbox.etp.DataSetTest$$$Lambda$17/1227659410@4d4c4b14, obj#40: sandbox.etp.DataSetTest$NbboQuoteMatch
+- DeserializeToObject createexternalrow(type#11.toString, staticinvoke(class org.apache.spark.sql.catalyst.util.DateTimeUtils$, ObjectType(class java.sql.Date), toJavaDate, refDate#12, true, false), symbol#13.toString, birth#14, death#15, StructField(type,StringType,true), StructField(refDate,DateType,true), StructField(symbol,StringType,true), StructField(birth,IntegerType,false), StructField(death,IntegerType,false)), obj#39: org.apache.spark.sql.Row
+- *(1) Sort [birth#14 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(birth#14 ASC NULLS FIRST, 200)
+- Exchange hashpartitioning(refDate#12, symbol#13, 200)
+- LocalTableScan [type#11, refDate#12, symbol#13, birth#14, death#15]
您在 mapPartitions 闭包中使用processTInsidePartition
。
尝试扩展Serializable
接口如下-
object DataSetTest extends Serializable {
...
...
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.