I'm pretty new to Spark and Scala. I'm trying to run a small program to better understand mapPartitions. However, when I run the code below, I get an error that the class isn't serializable. I tried putting the code in a companion class, but that didn't work, and neither did removing private from everything. I'm at a loss as to why this isn't working.
import java.sql.Date
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import scala.collection.mutable
object DataSetTest {

  val spark: SparkSession = createSparkContext()

  def main(args: Array[String]): Unit = {
    import spark.implicits._

    val refDate = new Date(System.currentTimeMillis())

    val events: Dataset[Row] = Seq(
      ("Q", "A", 5, 10),
      ("N", "A", 8, 11)
    ).toDF("type", "symbol", "birth", "death")
      .withColumn("refDate", lit(refDate))
      .repartition(col("refDate"), col("symbol"))
      .sort(col("birth"))

    events.show()

    val matches: Dataset[NbboQuoteMatch] = events.mapPartitions(partition =>
      return processTInsidePartition(partition)
    )
    matches.show()
  }

  val processTInsidePartition = (partition: Iterator[Row]) => {
    import spark.implicits._
    val results = new mutable.ArrayBuffer[NbboQuoteMatch]
    val rdd: RDD[NbboQuoteMatch] = spark.sparkContext.parallelize(results.toList)
    spark.createDataset[NbboQuoteMatch](rdd)
  }

  case class NbboQuoteMatch(nbbo: Nbbo, quote: Quote)

  trait Event {
    def getBirth(): Long
    def getDeath(): Long
    def getType(): String
  }

  case class Nbbo(symbol: String, birth: Long, death: Long) extends Event {
    override def getBirth(): Long = birth
    override def getDeath(): Long = death
    override def getType(): String = "Q"
  }

  case class Quote(symbol: String, birth: Long, death: Long) extends Event {
    override def getBirth(): Long = birth
    override def getDeath(): Long = death
    override def getType(): String = "N"
  }

  def createSparkContext(): SparkSession = {
    SparkSession.builder()
      .appName("abc")
      .config(new SparkConf())
      .enableHiveSupport()
      .config("spark.master", "local[*]")
      .config("hive.exec.dynamic.partition", "true")
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      .config("spark.executor.instances", 8)
      .getOrCreate()
  }
}
Error:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable: java.io.NotSerializableException: sandbox.etp.DataSetTest$
Serialization stack:
- object not serializable (class: sandbox.etp.DataSetTest$, value: sandbox.etp.DataSetTest$@7d1ccdfe)
- element of array (index: 0)
- array (class [Ljava.lang.Object;, size 2)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class sandbox.etp.DataSetTest$, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic sandbox/etp/DataSetTest$.$anonfun$main$1:(Lsandbox/etp/DataSetTest$;Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/runtime/Nothing$;, instantiatedMethodType=(Lscala/collection/Iterator;)Lscala/runtime/Nothing$;, numCaptured=2])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class sandbox.etp.DataSetTest$$$Lambda$17/1227659410, sandbox.etp.DataSetTest$$$Lambda$17/1227659410@4d4c4b14)
- field (class: org.apache.spark.sql.execution.MapPartitionsExec, name: func, type: interface scala.Function1)
- object (class org.apache.spark.sql.execution.MapPartitionsExec, MapPartitions sandbox.etp.DataSetTest$$$Lambda$17/1227659410@4d4c4b14, obj#40: sandbox.etp.DataSetTest$NbboQuoteMatch
+- DeserializeToObject createexternalrow(type#11.toString, staticinvoke(class org.apache.spark.sql.catalyst.util.DateTimeUtils$, ObjectType(class java.sql.Date), toJavaDate, refDate#12, true, false), symbol#13.toString, birth#14, death#15, StructField(type,StringType,true), StructField(refDate,DateType,true), StructField(symbol,StringType,true), StructField(birth,IntegerType,false), StructField(death,IntegerType,false)), obj#39: org.apache.spark.sql.Row
+- *(1) Sort [birth#14 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(birth#14 ASC NULLS FIRST, 200)
+- Exchange hashpartitioning(refDate#12, symbol#13, 200)
+- LocalTableScan [type#11, refDate#12, symbol#13, birth#14, death#15]
You are using processTInsidePartition inside the mapPartitions closure, so Spark has to serialize the enclosing DataSetTest object to ship that lambda to the executors; that is the sandbox.etp.DataSetTest$ instance in the serialization stack. Try extending the Serializable interface as follows:

object DataSetTest extends Serializable {
  ...
}
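Note that even with that change, the partition function as written has two further problems: it uses the SparkSession (spark.sparkContext.parallelize, spark.createDataset) inside the closure, which can't work because the session only exists on the driver, and the return keyword inside the lambda is a non-local return rather than a value (hence the scala.runtime.Nothing$ in the stack trace). mapPartitions expects a function of type Iterator[Row] => Iterator[NbboQuoteMatch]. A minimal sketch of that shape, with the actual matching logic left as a placeholder since it isn't shown in the question:

val matches: Dataset[NbboQuoteMatch] = events.mapPartitions { partition =>
  // ... derive NbboQuoteMatch instances from the Rows in this partition ...
  val results = new mutable.ArrayBuffer[NbboQuoteMatch]
  results.iterator // yield a plain Iterator (no return keyword); Spark assembles the Dataset
}

This keeps all driver-side objects, the SparkSession in particular, out of the closure, so only the function itself needs to be serialized.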