
java.lang.OutOfMemoryError: Java heap space in spark application

I am running a Spark application that reads messages from a very large (~7M rows) table, processes them, and writes the results back to the same table. The application works fine with a small table, but on this table I get the error mentioned above.

Code

package com.vocp.userProfile


import java.util.{ArrayList, Arrays}
import java.net.MalformedURLException

import gate.util.GateException

import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.KeyValue.Type
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.codehaus.jettison.json.{JSONArray, JSONObject}

import com.vocp.userProfile.Configuration.{VOCPConstants, VocpConfiguration}
import com.vocp.userProfile.UserProfile.{Utility, controller_java, pipeline}

/**
  * Created by sahil on 12/4/17.
  */

class runner {
  var log: Logger = Logger.getLogger(classOf[runner])
  val conf = VocpConfiguration.create()
}

object runner {
  val run = new runner
  val conf = run.conf
  val log = run.log

  @throws(classOf[Exception])
  def nlpAnnotationExtraction(batchString: String): Int = {

    log.info("In Main Object..")

    //Initializing Spark Context
    val sc = new SparkContext(new SparkConf().setAppName("TheExorcist"))
    sc.addFile(conf.get(VOCPConstants.GATE_PLUGIN_ARCHIVE), true)
    val batchId =
      if (batchString == "newbatch")
        java.lang.Long.toString(System.currentTimeMillis())
      else batchString

    conf.set("batchId", batchId)
    try {

      conf.set(TableInputFormat.INPUT_TABLE, conf.get(VOCPConstants.INPUTTABLE))
      conf.set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))

      val job: Job = Job.getInstance(conf, "NLPAnnotationJob")
      job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, conf.get(VOCPConstants.OUTPUTTABLE))
      job.setOutputFormatClass(classOf[MultiTableOutputFormat])

      val admin = new HBaseAdmin(conf)
      if (!admin.isTableAvailable(conf.get(VOCPConstants.OUTPUTTABLE))) {
        val tableDesc = new HTableDescriptor(TableName.valueOf(conf.get(VOCPConstants.OUTPUTTABLE)))
        admin.createTable(tableDesc)
      }
      // Configure the full-table scan: fetch 5000 rows per RPC and skip the region server block cache
      val scan = new Scan()
      scan.setCaching(5000)
      scan.setCacheBlocks(false)

      val inputCfs = conf.get(VOCPConstants.INPUTCOLUMNFAMILIES).split(",")
      inputCfs.foreach { x => scan.addFamily(Bytes.toBytes(x)) }

      val proto = ProtobufUtil.toScan(scan)
      val scan_string = Base64.encodeBytes(proto.toByteArray)
      job.getConfiguration().set(TableInputFormat.SCAN, scan_string)

      // Read the input HBase table as an RDD of (rowkey, Result) pairs
      val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      val processedFilteredRDD = hBaseRDD.map(x => x._2)
        .filter{ result =>
          val to_process : String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("to_process")))
          val user_flag : String = Bytes.toString(result.getValue(Bytes.toBytes("p"),
            Bytes.toBytes("user_flag")))
          (to_process != "0" && user_flag == null)
        }
      val messageRDD = processedFilteredRDD.map { result =>
        val message = Bytes.toString(result.getValue(Bytes.toBytes("p"),
          Bytes.toBytes("message")))
        (Bytes.toString(result.getRow()), message)
      }
      val filterRDD = messageRDD.filter { x =>
        var flag = true
        if (x._2 == null || x._2.toString.trim.length == 0) {
          flag = false
        }
        (flag)
      }

      val pluginHome = "plugins"
      // One GATE pipeline is instantiated per partition and reused for every message in it
      val pipelineRDD = filterRDD.mapPartitions { iter =>
        val pipe = new pipeline(pluginHome)
        iter.map { x =>
          val result = pipe.exec(x._2.toLowerCase())
          val resultJson: JSONObject = new JSONObject(result.trim())
          var Json: JSONArray = resultJson.get("cancerStage").asInstanceOf[JSONArray]
          val CancerStage: String = controller_java.getEntity(Json, "cancerStage")
          Json = resultJson.get("age").asInstanceOf[JSONArray]
          val Age: String = controller_java.getEntity(Json, "age")
          Json = resultJson.get("location").asInstanceOf[JSONArray]
          val Location: String = controller_java.getEntity(Json, "location")
          Json = resultJson.get("her2Status").asInstanceOf[JSONArray]
          val Her2Status: String = controller_java.getEntity(Json, "her2Status")
          (x._1, x._2, Age, Location, Her2Status, CancerStage)
        }
      }
      val outputBroadcast = sc.broadcast(conf.get(VOCPConstants.OUTPUTTABLE))
      val inputBroadcast = sc.broadcast(conf.get(VOCPConstants.INPUTTABLE))

      val filterPipelineRDD = pipelineRDD.filter { x =>
        var flag = true
        if (x._3 == null && x._4 == null && x._5 == null && x._6 == null) flag = false
        (flag)
      }
      // Convert each annotated row into Puts for the output and input tables and write them out
      val newRDD = filterPipelineRDD.flatMap{x => convertToPut(x, outputBroadcast, inputBroadcast)}
      newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
      return 0
    } catch {
      case e: Exception =>
        log.error("NLP annotation job failed", e)
        throw e
    }
  }
  def convertToPut(row: (String, String, String,  String, String, String),
                   output_table: Broadcast[String], input_table: Broadcast[String]):
  List[(ImmutableBytesWritable, Put)] = {
    var putList: List[(ImmutableBytesWritable, Put)] = List()
    val rowkey = row._1
    val message = row._2
    val age = row._3
    val location = row._4
    val her2status = row._5
    val cancerStage = row._6
    val put = new Put(Bytes.toBytes(rowkey))
    val put_input = new Put(Bytes.toBytes(rowkey))
    val flagCFDataBytes = Bytes.toBytes("p")
    val pfamily = Bytes.toBytes("data")
    put.add(flagCFDataBytes, Bytes.toBytes("message"), Bytes.toBytes(message))
    put_input.add(flagCFDataBytes, Bytes.toBytes("user_flag"), Bytes.toBytes("1"))
    if(age != null)
      put.add(pfamily, Bytes.toBytes("age"), Bytes.toBytes(age))
    if(location != null)
      put.add(pfamily, Bytes.toBytes("location"), Bytes.toBytes(location))
    if(her2status != null)
      put.add(pfamily, Bytes.toBytes("her2Status"), Bytes.toBytes(her2status))
    if(cancerStage != null)
      put.add(pfamily, Bytes.toBytes("cancerStage"), Bytes.toBytes(cancerStage))
    putList = putList:+(new ImmutableBytesWritable(Bytes.toBytes(output_table.value)), put)
    putList = putList:+(new ImmutableBytesWritable(Bytes.toBytes(input_table.value)), put_input)
    return putList
  }
  def pipeLineExecute(args: Array[String]): Int = {

    var batchString = ""
    val usage = "Usage: NLPAnnotationController" + " -inputTable tableName -outputTable tableName" +
      " -batchId batchId / -newbatch \n"
    if (args.length == 0) {
      System.err.println(usage)
      return -1
    }

    var i = 0
    while ( i < args.length){
      if ("-inputTable" == args(i)) {
        conf.set(VOCPConstants.INPUTTABLE, args(i+1))
        i=i+2
      } else if ("-outputTable" == args(i)) {
        conf.set(VOCPConstants.OUTPUTTABLE, args(i+1))
        i=i+2
      } else if ("-batchId" == args(i)) {
        batchString = args(i+1)
        i=i+2
      } else if ("-newbatch" == args(i)) {
        batchString = "newbatch"
        i=i+1
      } else {
        throw new IllegalArgumentException("arg " + args(i) + " not recognized")
      }

    }
    val result = nlpAnnotationExtraction(batchString)
    result

  }

  def main(args: Array[String]) {
    val res = pipeLineExecute(args)
    System.exit(res)
  }
}

The error stack:

java.lang.OutOfMemoryError: Java heap space
        at java.util.HashMap.newNode(HashMap.java:1742)
        at java.util.HashMap.putVal(HashMap.java:630)
        at java.util.HashMap.putMapEntries(HashMap.java:514)
        at java.util.HashMap.putAll(HashMap.java:784)
        at gate.annotation.AnnotationSetImpl.<init>(AnnotationSetImpl.java:164)
        at gate.jape.SinglePhaseTransducer.attemptAdvance(SinglePhaseTransducer.java:572)
        at gate.jape.SinglePhaseTransducer.transduce(SinglePhaseTransducer.java:338)
        at gate.jape.MultiPhaseTransducer.transduce(MultiPhaseTransducer.java:188)
        at gate.jape.Batch.transduce(Batch.java:204)
        at gate.creole.Transducer.execute(Transducer.java:166)
        at gate.util.Benchmark.executeWithBenchmarking(Benchmark.java:291)
        at gate.creole.SerialController.runComponent(SerialController.java:225)
        at gate.creole.SerialController.executeImpl(SerialController.java:157)
        at gate.creole.SerialAnalyserController.executeImpl(SerialAnalyserController.java:223)
        at gate.creole.SerialAnalyserController.execute(SerialAnalyserController.java:126)
        at com.vocp.userProfile.UserProfile.pipeline.exec(pipeline.java:75)
        at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:117)
        at com.vocp.userProfile.runner$$anonfun$5$$anonfun$apply$1.apply(runner.scala:116)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:389)
        at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply$mcV$sp(PairRDDFunctions.scala:1111)
        at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)

Things I've already done:

  1. Increased the executor memory: --executor-memory 10g (a configuration sketch follows this list).
  2. Optimised the code. I still think the GATE processing is causing the exception and consuming the memory.
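
For reference, a minimal sketch of how the same memory settings could be applied programmatically when the SparkContext is built. The values, the executor-core limit, and the YARN-specific overhead property are my assumptions for illustration, not taken from the actual submission; fewer cores per executor simply means fewer GATE tasks sharing one heap at a time:

// Illustrative values only; adjust to the cluster and Spark version.
val sparkConf = new SparkConf()
  .setAppName("TheExorcist")
  .set("spark.executor.memory", "10g")               // same effect as --executor-memory 10g
  .set("spark.executor.cores", "2")                   // limit concurrent tasks per executor heap
  .set("spark.yarn.executor.memoryOverhead", "2048")  // off-heap headroom in MB (Spark on YARN)
val sc = new SparkContext(sparkConf)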

Any help would be useful. Thanks in advance.

I'd suggest you reduce your input RDD size to a half or a quarter and see if that works (see the sketch below).

Performing operations only on subset of a RDD
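
A rough sketch of that suggestion against the code above, assuming you sample right after hBaseRDD is created; the fraction and seed are illustrative. If a quarter of the rows processes cleanly, the problem is sheer volume (or a few oversized messages) rather than the logic:

// Keep roughly 25% of the scanned rows and feed them into the unchanged pipeline
val sampledRDD = hBaseRDD.sample(withReplacement = false, fraction = 0.25, seed = 42L)
val processedFilteredRDD = sampledRDD.map(x => x._2)
  .filter { result =>
    val to_process = Bytes.toString(result.getValue(Bytes.toBytes("p"), Bytes.toBytes("to_process")))
    val user_flag = Bytes.toString(result.getValue(Bytes.toBytes("p"), Bytes.toBytes("user_flag")))
    to_process != "0" && user_flag == null
  }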
