java.lang.OutOfMemoryError: Java heap space in Spark application
I am running a Spark application that reads messages from a very large (~7M) table, processes them, and writes the results back to the same table. The application works fine with a small table; however, on this table I get the error mentioned above.
Code
package com.vocp.userProfile

import java.net.MalformedURLException
import java.util.{ArrayList, Arrays}

import gate.util.GateException

import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.KeyValue.Type
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.codehaus.jettison.json.{JSONArray, JSONObject}

import com.vocp.userProfile.Configuration.{VOCPConstants, VocpConfiguration}
import com.vocp.userProfile.UserProfile.{Utility, controller_java, pipeline}
/**
 * Created by sahil on 12/4/17.
 */
class runner {
  var log: Logger = Logger.getLogger(classOf[runner])
  val conf = VocpConfiguration.create()
}
object runner {
  val run = new runner
  val conf = run.conf
  val log = run.log

  @throws(classOf[Exception])
  def nlpAnnotationExtraction(batchString: String): Int = {
    log.info("In Main Object..")

    // Initialize the Spark context and ship the GATE plugin archive to the executors.
    val sc = new SparkContext(new SparkConf().setAppName("TheExorcist"))
    sc.addFile(conf.get(VOCPConstants.GATE_PLUGIN_ARCHIVE), true)

    val batchId =
      if (batchString == "newbatch")
        java.lang.Long.toString(System.currentTimeMillis())
      else batchString
    conf.set("batchId", batchId)

    try {
      conf.set(TableInputFormat.INPUT_TABLE, conf.get(VOCPConstants.INPUTTABLE))
      conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))

      val job: Job = Job.getInstance(conf, "NLPAnnotationJob")
      job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))
      job.setOutputFormatClass(classOf[MultiTableOutputFormat])

      // Create the output table on first run.
      val admin = new HBaseAdmin(conf)
      if (!admin.isTableAvailable(conf.get(VOCPConstants.OUTPUTTABLE))) {
        val tableDesc = new HTableDescriptor(TableName.valueOf(conf.get(VOCPConstants.OUTPUTTABLE)))
        admin.createTable(tableDesc)
      }

      // Scan the configured column families: 5000 rows per RPC, no block cache.
      val scan = new Scan()
      scan.setCaching(5000)
      scan.setCacheBlocks(false)
      val inputCfs = conf.get(VOCPConstants.INPUTCOLUMNFAMILIES).split(",")
      inputCfs.foreach { x => scan.addFamily(Bytes.toBytes(x)) }
      val proto = ProtobufUtil.toScan(scan)
      val scan_string = Base64.encodeBytes(proto.toByteArray)
      job.getConfiguration().set(TableInputFormat.SCAN, scan_string)

      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])

      // Keep only rows flagged for processing that have not been handled yet.
      val processedFilteredRDD = hBaseRDD.map(x => x._2)
        .filter { result =>
          val to_process: String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("to_process")))
          val user_flag: String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("user_flag")))
          to_process != "0" && user_flag == null
        }

      // Extract (rowkey, message) pairs and drop rows with empty messages.
      val messageRDD = processedFilteredRDD.map { result =>
        val message = Bytes.toString(result.getValue(Bytes.toBytes("p"),
          Bytes.toBytes("message")))
        (Bytes.toString(result.getRow()), message)
      }
      val filterRDD = messageRDD.filter { x =>
        x._2 != null && x._2.trim.length > 0
      }

      // Instantiate the GATE pipeline once per partition, then annotate each message.
      val pluginHome = "plugins"
      val pipelineRDD = filterRDD.mapPartitions { iter =>
        val pipe = new pipeline(pluginHome)
        iter.map { x =>
          val result = pipe.exec(x._2.toLowerCase())
          val resultJson: JSONObject = new JSONObject(result.trim())
          var Json: JSONArray = resultJson.get("cancerStage").asInstanceOf[JSONArray]
          val CancerStage: String = controller_java.getEntity(Json, "cancerStage")
          Json = resultJson.get("age").asInstanceOf[JSONArray]
          val Age: String = controller_java.getEntity(Json, "age")
          Json = resultJson.get("location").asInstanceOf[JSONArray]
          val Location: String = controller_java.getEntity(Json, "location")
          Json = resultJson.get("her2Status").asInstanceOf[JSONArray]
          val Her2Status: String = controller_java.getEntity(Json, "her2Status")
          (x._1, x._2, Age, Location, Her2Status, CancerStage)
        }
      }

      val outputBroadcast = sc.broadcast(conf.get(VOCPConstants.OUTPUTTABLE))
      val inputBroadcast = sc.broadcast(conf.get(VOCPConstants.INPUTTABLE))

      // Drop rows where the pipeline extracted no entity at all.
      val filterPipelineRDD = pipelineRDD.filter { x =>
        !(x._3 == null && x._4 == null && x._5 == null && x._6 == null)
      }

      val newRDD = filterPipelineRDD.flatMap { x => convertToPut(x, outputBroadcast, inputBroadcast) }
      newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
      0
    } catch {
      // The posted snippet had a try with no catch; log and rethrow so the
      // declared @throws contract still holds.
      case e: Exception =>
        log.error(e.getMessage, e)
        throw e
    }
  }
  // Build one Put for the output table (the extracted entities) and one for the
  // input table (marks the row as processed) for each annotated message.
  def convertToPut(row: (String, String, String, String, String, String),
                   output_table: Broadcast[String], input_table: Broadcast[String]):
      List[(ImmutableBytesWritable, Put)] = {
    var putList: List[(ImmutableBytesWritable, Put)] = List()
    val rowkey = row._1
    val message = row._2
    val age = row._3
    val location = row._4
    val her2status = row._5
    val cancerStage = row._6

    val put = new Put(Bytes.toBytes(rowkey))
    val put_input = new Put(Bytes.toBytes(rowkey))
    val flagCFDataBytes = Bytes.toBytes("p")
    val pfamily = Bytes.toBytes("data")

    put.add(flagCFDataBytes, Bytes.toBytes("message"), Bytes.toBytes(message))
    put_input.add(flagCFDataBytes, Bytes.toBytes("user_flag"), Bytes.toBytes("1"))
    if (age != null)
      put.add(pfamily, Bytes.toBytes("age"), Bytes.toBytes(age))
    if (location != null)
      put.add(pfamily, Bytes.toBytes("location"), Bytes.toBytes(location))
    if (her2status != null)
      put.add(pfamily, Bytes.toBytes("her2Status"), Bytes.toBytes(her2status))
    if (cancerStage != null)
      put.add(pfamily, Bytes.toBytes("cancerStage"), Bytes.toBytes(cancerStage))

    putList = putList :+ (new ImmutableBytesWritable(Bytes.toBytes(output_table.value)), put)
    putList = putList :+ (new ImmutableBytesWritable(Bytes.toBytes(input_table.value)), put_input)
    putList
  }
  // Parse the command-line arguments, then kick off the extraction job.
  def pipeLineExecute(args: Array[String]): Int = {
    var batchString = ""
    val usage = "Usage: NLPAnnotationController" + " -inputTable tableName -outputTable tableName" +
      " -batchId batchId / -newbatch \n"
    if (args.length == 0) {
      System.err.println(usage)
      return -1
    }
    var i = 0
    while (i < args.length) {
      if ("-inputTable" == args(i)) {
        conf.set(VOCPConstants.INPUTTABLE, args(i + 1))
        i = i + 2
      } else if ("-outputTable" == args(i)) {
        conf.set(VOCPConstants.OUTPUTTABLE, args(i + 1))
        i = i + 2
      } else if ("-batchId" == args(i)) {
        batchString = args(i + 1)
        i = i + 2
      } else if ("-newbatch" == args(i)) {
        batchString = "newbatch"
        i = i + 1
      } else {
        throw new IllegalArgumentException("arg " + args(i) + " not recognized")
      }
    }
    nlpAnnotationExtraction(batchString)
  }

  def main(args: Array[String]) {
    val res = pipeLineExecute(args)
    System.exit(res)
  }
}
The error stack:
java.lang.OutOfMemoryError: Java heap space
at java.util.HashMap.newNode(HashMap.java:1742)
at java.util.HashMap.putVal(HashMap.java:630)
at java.util.HashMap.putMapEntries(HashMap.java:514)
at java.util.HashMap.putAll(HashMap.java:784)
at gate.annotation.AnnotationSetImpl.<init>(AnnotationSetImpl.java:164)
at gate.jape.SinglePhaseTransducer.attemptAdvance(SinglePhaseTransducer.java:572)
at gate.jape.SinglePhaseTransducer.transduce(SinglePhaseTransducer.java:338)
at gate.jape.MultiPhaseTransducer.transduce(MultiPhaseTransducer.java:188)
at gate.jape.Batch.transduce(Batch.java:204)
at gate.creole.Transducer.execute(Transducer.java:166)
at gate.util.Benchmark.executeWithBenchmarking(Benchmark.java:291)
at gate.creole.SerialController.runComponent(SerialController.java:225)
at gate.creole.SerialController.executeImpl(SerialController.java:157)
at gate.creole.SerialAnalyserController.executeImpl(SerialAnalyserController.java:223)
at gate.creole.SerialAnalyserController.execute(SerialAnalyserController.java:126)
at com.vocp.userProfile.UserProfile.pipeline.exec(pipeline.java:75)
at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:117)
at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:116)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:389)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply$mcV$sp(PairRDDFunctions.scala:1111)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)
Things I've already done.
Any help would be useful. Thanks in advance.
I'd suggest you reduce your input RDD size to half or a quarter and see if that works.
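A minimal sketch of that test, assuming the hBaseRDD loaded in the question's code (the 0.25 fraction and the fixed seed are arbitrary choices, picked only to make the trial run reproducible):

import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.spark.rdd.RDD

// Run the rest of the job on roughly a quarter of the input rows;
// sample() leaves the pipeline unchanged, only the data volume shrinks.
def sampleInput(hBaseRDD: RDD[(ImmutableBytesWritable, Result)]): RDD[(ImmutableBytesWritable, Result)] =
  hBaseRDD.sample(withReplacement = false, fraction = 0.25, seed = 42L)

If the sampled run completes, the full table is simply exceeding the executor heap; raising --executor-memory on spark-submit, lowering scan.setCaching(5000), or repartitioning the RDD into more, smaller partitions before the mapPartitions call are the usual next steps.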