I want to create an rdd of a collection and eventually store that in a dataframe that I can write back into a csv.
Here is my code:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{
StructType, StructField, StringType, IntegerType}
import org.apache.spark.sql.Row
object Main extends Serializable {
class Input_Class extends Serializable {
var name: String = "";
var age: String = "";
var gender: String = "";
def setter(src: Row) {
var row = src.toSeq
var i = 0;
name = row(i).toString;
i += 1;
age = row(i).toString;
i += 1;
gender = row(i).toString;
}
def printing(row: Row) {
println(name, age, gender)
}
}
class Output_Class extends Serializable {
var name: String = "";
var age: Int = 0;
var gender: String = "";
var marks: Int = 0;
def writer(){
val data = Array(name,age,gender,marks)
val distData = spark.sparkContext.parallelize(data)
distData.collect().foreach(println)
}
}
class Manager extends Serializable {
var inputObj = new Input_Class();
var outputObj = new Output_Class();
def inputSetter(src: Row) = {
inputObj.setter(src);
}
def transformation(){
outputObj.age = inputObj.age.toInt;
outputObj.name = inputObj.name;
outputObj.gender = inputObj.gender;
outputObj.marks = 100;
outputwriter();
}
def outputwriter(){
outputObj.writer();
}
}
def main(args: Array[String]) {
val spark = SparkSession.builder.appName("App").config("spark.master", "local").getOrCreate()
import spark.implicits._
val df = spark.read.csv("/Users/kagarwa9/Desktop/data.csv ");
df.rdd.map(row => {
var ManagerObj = new Manager();
ManagerObj.inputSetter(row)
ManagerObj.transformation();
}).collect();
spark.stop()
}
}
I need to create the classes for the input and output, it is a business need. The issue is arising in the writer method of the Output Class.
The error that I am getting is as follows:
20/06/04 16:45:55 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
20/06/04 16:45:58 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
java.lang.NullPointerException
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$Output_Class.writer(Test.scala:60)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$Manager.outputwriter(Test.scala:85)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$Manager.transformation(Test.scala:80)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$.$anonfun$main$1(Test.scala:96)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$.$anonfun$main$1$adapted(Test.scala:94)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:313)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:311)
at scala.collection.AbstractIterator.to(Iterator.scala:1429)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:305)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:305)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:292)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:286)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:945)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/06/04 16:45:58 WARN TaskSetManager: Lost task 0.0 in stage 1.0 (TID 1, localhost, executor driver): java.lang.NullPointerException
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$Output_Class.writer(Test.scala:60)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$Manager.outputwriter(Test.scala:85)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$Manager.transformation(Test.scala:80)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$.$anonfun$main$1(Test.scala:96)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$Main$.$anonfun$main$1$adapted(Test.scala:94)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:313)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:311)
at scala.collection.AbstractIterator.to(Iterator.scala:1429)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:305)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:305)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:292)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:286)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:945)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/06/04 16:45:58 ERROR TaskSetManager: Task 0 in stage 1.0 failed 1 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, localhost, executor driver): java.lang.NullPointerException
at Main$Output_Class.writer(Test.scala:60)
at Main$Manager.outputwriter(Test.scala:85)
at Main$Manager.transformation(Test.scala:80)
at Main$.$anonfun$main$1(Test.scala:96)
at Main$.$anonfun$main$1$adapted(Test.scala:94)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:313)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:311)
at scala.collection.AbstractIterator.to(Iterator.scala:1429)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:305)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:305)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:292)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:286)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:945)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:274)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at Main$.main(Test.scala:94)
... 51 elided
Caused by: java.lang.NullPointerException
at Main$Output_Class.writer(Test.scala:60)
at Main$Manager.outputwriter(Test.scala:85)
at Main$Manager.transformation(Test.scala:80)
at Main$.$anonfun$main$1(Test.scala:96)
at Main$.$anonfun$main$1$adapted(Test.scala:94)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:313)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:311)
at scala.collection.AbstractIterator.to(Iterator.scala:1429)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:305)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:305)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1429)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:292)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:286)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1429)
at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:945)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Please help me out here I am a newbie and don't have much experience on this.
You can't pass spark context inside rdd map function, like you do in
.map(row => {
...
ManagerObj.transformation(spark);
})
Instead, if you need to save your data, first do the transformation, and then collect it, or save to some storage.
Also, function inside df.map
is being executed on the worker nodes, not on the driver, so ManagerObj
instance you've created also won't be available there.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.