![](/img/trans.png)
[英]org.apache.spark.SparkException: Task not serializable in Spark Scala
[英]org.apache.spark.SparkException: Task not serializable (scala)
我是scala和FOR spark的新手,請幫助我解決此問題。 在spark shell中,當我分別加載以下函數時,它們會正常運行,當我在scala對象中復制此函數,並在spark shell中加載相同的文件時,在嘗試並行化時,它們會在“ processbatch”函數中引發任務而不是序列化異常。 PFB代碼相同:
import org.apache.spark.sql.Row
import org.apache.log4j.Logger
import org.apache.spark.sql.hive.HiveContext
object Process {
val hc = new HiveContext(sc)
def processsingle(wait: Int, patient: org.apache.spark.sql.Row, visits: Array[org.apache.spark.sql.Row]) : String = {
var out = new StringBuilder()
val processStart = getTimeInMillis()
for( x <- visits ) {
out.append(", " + x.getAs("patientid") + ":" + x.getAs("visitid"))
}
}
def processbatch(batch: Int, wait: Int, patients: Array[org.apache.spark.sql.Row], visits: Array[org.apache.spark.sql.Row]) = {
val out = sc.parallelize(patients, batch).map( r=> processsingle(wait, r, visits.filter(f=> f.getAs("patientid") == r.getAs("patientid")))).collect()
for(x <- out) println(x)
}
def processmeasures(fetch: Int, batch: Int, wait: Int) = {
val patients = hc.sql("SELECT patientid FROM tableName1 order by p_id").collect()
val visit = hc.sql("SELECT patientid, visitid FROM tableName2")
val count = patients.length
val fetches = if(count % fetch > 0) (count / fetch + 1) else (count / fetch)
for(i <- 0 to fetches.toInt-1){
val startFetch = i*fetch
val endFetch = math.min((i+1)*fetch, count.toInt)-1
val fetchSize = endFetch - startFetch + 1
val fetchClause = "patientid >= " + patients(startFetch).get(0) + " and patientid <= " + patients(endFetch).get(0)
val fetchVisit = visit.filter( fetchClause ).collect()
val batches = if(fetchSize % batch > 0) (fetchSize / batch + 1) else (fetchSize / batch)
for(j <- 0 to batches.toInt-1){
val startBatch = j*batch
val endBatch = math.min((j+1)*batch, fetch.toInt)-1
println(s"Batch from $startBatch to $endBatch");
val batchVisits = fetchVisit.filter(g => g.getAs[Long]("patientid") >= patients(i*fetch + startBatch).getLong(0) && g.getAs[Long]("patientid") <= patients(math.min(i*fetch + endBatch + 1, endFetch)).getLong(0))
processbatch(batch, wait, patients.slice(i*fetch + startBatch, i*fetch + endBatch + 1), batchVisits)
}
}
println("Processing took " + getExecutionTime(processStart) + " millis")
}
}
您應該使Process
對象可Serializable
:
object Process extends Serializable {
...
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.