[英]No such method running forEach in Scala job on Apache Spark
[英]Scala object apply method never called in Spark Job
我試圖在spark應用程序中解耦我的邏輯。 我為UDF定義和UDF聲明創建了單獨的類:
UDF宣言:
import OPXUdfDefinitions._
// UDF declaration: lifts myDef (imported from OPXUdfDefinitions) into a Spark UDF.
object OPXUdf extends Serializable {
// Forwards the parsed argument map to OPXUdfDefinitions so its internal state is set.
def apply(argsInput: Map[String, String]) = {
OPXUdfDefinitions(argsInput)
}
// NOTE(review): this val is evaluated when the object is first initialized.
// Executors re-initialize the singleton themselves (the driver-side apply()
// call never reaches them), so myDef can run against uninitialized state —
// the root cause of the NullPointerException in the stack trace below.
val myUDF = udf(myDef _)
}
UDF定義:
// UDF definitions: holds the parsed job arguments and the actual UDF body.
object OPXUdfDefinitions extends Serializable{
// Mutable holder for the argument map; remains null until apply() runs.
private var args: Map[String, String] = _
// Stores the arguments. NOTE(review): on a cluster this runs on the driver
// only; executors re-create this object via classloading and never call it,
// so args stays null there.
def apply(argsInput: Map[String, String]) = {
args = argsInput
println("Inside Apply UDFDef" + args) // This never called
}
// Forced on first myDef call; NPEs when args is still null in that JVM.
lazy val path = args.getOrElse(PATH, "/path/") // Here is a NullPointerException
lazy val myCustomObj = new MyClass(path)
// Per-row UDF body; forces the lazy vals above on its first invocation.
def myDef(a: Integer, b: Integer) = {
myCustomObj.getValue(util.Arrays.asList(a, b))
}
}
然后我有一個調用UDF的類 :
import OPXUdf._
// Transformation step that appends a "Value" column computed by the imported UDF.
class Tasks(args: Map[String, String]){
// Curried so the result can be passed directly to DataFrame.transform.
def getValue()(df: DataFrame): DataFrame = {
val valueColumn = myUDF(col("Num1"), col("Num2"))
df.withColumn("Value", valueColumn)
}
}
然后我有一個ETL作業和定義的抽象類 :
// Bundles one ETL run: a source DataFrame, a transformation, and a writer,
// plus an optional metadata map for bookkeeping.
case class EtlDefinition(
sourceDF: DataFrame,
transform: (DataFrame => DataFrame),
write: (DataFrame => Unit),
metadata: scala.collection.mutable.Map[String, Any] = scala.collection.mutable.Map[String, Any]()
) extends Serializable {
// Applies the transformation to the source and hands the result to the writer.
def process(): Unit = {
val transformed = sourceDF.transform(transform)
write(transformed)
}
}
// Template for ETL jobs: subclasses supply the read, transform, and write stages.
abstract class ETLJob extends Serializable{
def read(): DataFrame
def transform()(df: DataFrame): DataFrame
def write()(df:DataFrame): Unit
// Wires the three stages into an EtlDefinition and runs it.
def execute(): Unit = {
val job = EtlDefinition(
sourceDF = read(),
transform = transform(),
write = write()
)
job.process()
}
}
然后我有我的ETL工作 :
// Concrete ETL job: reads Avro files listed in INPUT_PATH, transforms them via
// Tasks, and writes Parquet to OUTPUT_PATH.
class MyETLJob(spark: SparkSession, args: Map[String, String]) extends ETLJob {
// Fail fast with a clear message if a required argument is missing.
// The original `args.getOrElse(INPUT_PATH, null).split(",")` threw an
// uninformative NullPointerException here, and a null outputPath only
// surfaced much later, at write time. getOrElse's default is by-name,
// so sys.error is evaluated only when the key is absent.
val files = args.getOrElse(INPUT_PATH, sys.error(s"Missing required argument: $INPUT_PATH")).split(",")
val outputPath = args.getOrElse(OUTPUT_PATH, sys.error(s"Missing required argument: $OUTPUT_PATH"))
// Loads all comma-separated input paths as a single Avro DataFrame.
override def read(): DataFrame = {
spark.read.
format("com.databricks.spark.avro").
load(files: _*)
}
// Delegates the per-row computation to Tasks (which applies the UDF).
override def transform()(df: DataFrame): DataFrame = {
val tasks = new Tasks(args)
df.transform(tasks.getValue())
}
// Overwrites any previous output at outputPath with Parquet.
override def write()(df: DataFrame): Unit = {
df
.write
.mode("overwrite")
.parquet(outputPath)
}
}
然后在我的主要課程中我打電話:
// Application entry point: parses arguments, primes the UDF singletons on the
// driver, builds the SparkSession, and runs the ETL job.
object Main {
def main(args: Array[String]): Unit = {
// Parse the arguments and create a map
val argParams = ArgumentParser.parse(args)
println("ParamsMap: " + argParams) // I am getting the map here
// NOTE(review): these apply() calls initialize the objects in the DRIVER
// JVM only; executors re-create the singletons during task deserialization,
// so this state never reaches them — hence the executor-side NPE.
OPXUdfDefinitions(argParams)
OPXUdf(argParams)
val conf = new SparkConf().
set("spark.sql.shuffle.partitions", "100").
set("parquet.enable.summary-metadata", "true")
val spark = SparkSession.
builder().
appName("My App").
config(conf).
getOrCreate()
val etlJob = new MyETLJob(spark, argParams)
etlJob.execute()
}
但是我的 OPXUdfDefinitions 的 apply 方法從未被調用過，每當 myUDF 被調用時，我總是得到空指針異常。我嘗試對路徑變量使用惰性（lazy）初始化，但仍然出現空指針異常。
這是堆棧跟蹤:
org.apache.spark.SparkException: Failed to execute user defined function(anonfun$myUDF$1: (int, int) => string)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.ScalaUDF_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.subExpr_3$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:217)
at org.apache.spark.sql.execution.SortExec$$anonfun$1.apply(SortExec.scala:108)
at org.apache.spark.sql.execution.SortExec$$anonfun$1.apply(SortExec.scala:101)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException
at com.myjob.app.OPXUdfDefinitions$.path$lzycompute(OPXUdfDefinitions.scala:37)
at com.myjob.app.OPXUdfDefinitions$.path(OPXUdfDefinitions.scala:37)
at com.myjob.app.OPXUdfDefinitions$.myCustomObj$lzycompute(OPXUdfDefinitions.scala:38)
at com.myjob.app.OPXUdfDefinitions$.myCustomObj(OPXUdfDefinitions.scala:38)
at com.myjob.app.OPXUdfDefinitions$.myDef(OPXUdfDefinitions.scala:45)
at com.myjob.app.OPXUdf$$anonfun$myUDF$1.apply(OPXUdf.scala:19)
at com.myjob.app.OPXUdf$$anonfun$myUDF$1.apply(OPXUdf.scala:19)
如果有人能夠解釋我做錯了什么以及如何糾正它會很棒。 謝謝。
問題出在 OPXUdfDefinitions 內部：即使 path 是 lazy 的，您也在 apply 方法初始化 args 之前就訪問了它——例如在調用 println("UDFDef" + args) 時就會強制求值。
如果刪除這個 println(args)，就不應該再遇到 NullPointerException 了，如下所示：
// Answer's version: identical to the question's code except that the two
// object-body println(args) statements are commented out — as object-level
// statements they would run during initialization and force args while null.
object OPXUdfDefinitions extends Serializable{
private var args: Map[String, String] = _
def apply(argsInput: Map[String, String]) = {
args = argsInput
println("Inside Apply UDFDef" + args) // This never called
}
//println("UDFDef" + args) // this is initializing args
// Safe only after apply() has run in the same JVM — TODO confirm on executors.
lazy val path = args.getOrElse(PATH, "/path/")
lazy val myCustomObj = new MyClass(path)
//println("UDFDef" + args) // same here
def myDef(a: Integer, b: Integer) = {
myCustomObj.getValue(util.Arrays.asList(a, b))
}
}
你能試一下嗎?:
def myDef(a: Integer, b: Integer): String = {
和
val myUDF = udf((a: Integer, b: Integer) => myDef(a,b))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.