
Spark Hbase inserts too many connections open

I'm trying to perform inserts into HBase after parsing some text. The code works fine, but I think it can be organized for better performance. In the code below I open a connection inside a loop, and I would like to know how I can open a single connection and use it for all inserts. I think I would need to pass the connection to a function for this to happen.

def extractInfo(fp: String) = {
  val p: Parser = new AutoDetectParser()
  val fs = FileSystem.get(new java.net.URI("XXXXXXXXXX"), new Configuration())
  val inputPath: Path = new Path(fp)
  val is: InputStream = fs.open(inputPath)
  val handler: BodyContentHandler = new BodyContentHandler(-1)
  val metadata: Metadata = new Metadata()
  try {
    p.parse(is, handler, metadata, new ParseContext())
    is.close()
    val hand = handler.toString()
    val gson = new Gson
    val jsonTree = gson.toJsonTree(metadata)
    val metaNode = jsonTree.getAsJsonObject().getAsJsonObject("metadata")
    val jsonString = gson.toJson(metaNode)
    if (hand.trim().isEmpty()) {
      println("no Text extracted", inputPath)
    } else {
      println("Success")
    }
    val fname = "ABC"
    val configuration: Configuration = HBaseConfiguration.create()
    configuration.set("hbase.zookeeper.quorum", "XXXX")
    configuration.set("hbase.zookeeper.property.clientPort", "XXXX")
    configuration.set("zookeeper.znode.parent", "/hbase-XXX")
    configuration.set("hbase.client.keyvalue.maxsize", "0")
    val principal = System.getProperty("kerberosPrincipal", "XXXXX")
    val keytabLocation = System.getProperty("kerberosKeytab", "XXXXXXXXX")
    UserGroupInformation.setConfiguration(configuration)
    UserGroupInformation.loginUserFromKeytab(principal, keytabLocation)
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create(configuration))
    val admin = connection.getAdmin
    val hTable: HTable = new HTable(configuration, "XXXXXXXXX")
    val g = new Put(Bytes.toBytes(fname))
    g.add(Bytes.toBytes("txt"), Bytes.toBytes("text"), Bytes.toBytes(hand))
    hTable.put(g)
    val m = new Put(Bytes.toBytes(fname))
    m.add(Bytes.toBytes("data"), Bytes.toBytes("info"), Bytes.toBytes(jsonString))
    hTable.put(m)
    hTable.close()
    fs.close()
  }
  catch {
    case e: Throwable => {
      println(e.printStackTrace)
    }
  }
}


object App {
  def main(args: Array[String]) {
    val fnames = "/X/X/XXXXX.XXX"
    fnames.foreach { x => extractInfo(x) }
  }
}

In Spark, if you are updating HBase from the executors (not the driver), create one connection per executor so that the connection is reused within that executor. This way you save the connection-creation overhead. However, create a table object per thread, because the HBase table object is not thread-safe (see the official HBase client documentation).

And of course, close the table and the connection whenever you are done.

// This object makes the connection reside on the executor side, not the driver,
// and serves as a singleton per executor JVM, so the connection is shared between
// executor threads -- Connection is thread-safe, Table is not.
object HbaseHandler {
  @volatile private var connection: Option[Connection] = None

  private def getConnection: Connection = synchronized {
    if (connection.isEmpty) {
      // build the HBase configuration here (quorum, client port, znode parent, ...)
      connection = Some(ConnectionFactory.createConnection(HBaseConfiguration.create()))
    }
    connection.get
  }

  def put(put: Put): Unit = {
    // Table is lightweight but not thread-safe: get one per call and close it
    val table = getConnection.getTable(TableName.valueOf("XXXXXXXXX"))
    try {
      table.put(put)
    } finally {
      table.close()
    }
  }
}
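
Note that the sketch above never closes the shared connection. Since it is meant to live for the lifetime of the executor, one option, shown here only as a hedged sketch under that assumption, is to register a JVM shutdown hook inside HbaseHandler:

// Inside HbaseHandler: close the executor-wide connection when the executor
// JVM exits (the per-call Table objects are already closed in put above).
sys.addShutdownHook {
  connection.foreach(_.close())
}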

...

rdd.foreach (
  row => {
    val put: Put = <generate put object>
    HbaseHandler.put(put)
  }
)
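
If each row becomes one Put, calling HbaseHandler.put per row still opens and closes a Table per row. A common refinement, shown only as a hedged sketch (makePut and the table name "XXXXXXXXX" are placeholders, and putAll is a hypothetical helper added inside HbaseHandler), is to batch each partition's puts with foreachPartition:

// Hypothetical batch helper inside HbaseHandler: reuse the shared connection,
// but write a whole partition's puts through a single Table.
def putAll(puts: java.util.List[Put]): Unit = {
  val table = getConnection.getTable(TableName.valueOf("XXXXXXXXX"))
  try {
    table.put(puts)
  } finally {
    table.close()
  }
}

// usage: build all puts for a partition, then write them in one call
rdd.foreachPartition { rows =>
  import scala.collection.JavaConverters._
  val puts = rows.map(row => makePut(row)).toList.asJava  // makePut stands for your <generate put object> step
  HbaseHandler.putAll(puts)
}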

=========== applied to the code in the question ===========

object Hbase {
  // Build the HBase configuration once at the object level so the shared
  // connection below can use it (in the question it was a local inside
  // extractInfo and was recreated on every call).
  private lazy val configuration: Configuration = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "XXXX")
    conf.set("hbase.zookeeper.property.clientPort", "XXXX")
    conf.set("zookeeper.znode.parent", "/hbase-XXX")
    conf.set("hbase.client.keyvalue.maxsize", "0")
    conf
  }

  private var hbaseConnection: Option[Connection] = None

  // Lazily create a single connection (doing the Kerberos login once) and
  // reuse it for every call to extractInfo.
  private def connection: Connection = synchronized {
    if (hbaseConnection.isEmpty) {
      val principal = System.getProperty("kerberosPrincipal", "XXXXX")
      val keytabLocation = System.getProperty("kerberosKeytab", "XXXXXXXXX")
      UserGroupInformation.setConfiguration(configuration)
      UserGroupInformation.loginUserFromKeytab(principal, keytabLocation)
      hbaseConnection = Some(ConnectionFactory.createConnection(configuration))
    }
    hbaseConnection.get
  }

  def extractInfo(fp: String) = {
    val p: Parser = new AutoDetectParser()
    val fs = FileSystem.get(new java.net.URI("XXXXXXXXXX"), new Configuration())
    val inputPath: Path = new Path(fp)
    val is: InputStream = fs.open(inputPath)
    val handler: BodyContentHandler = new BodyContentHandler(-1)
    val metadata: Metadata = new Metadata()
    try {
      p.parse(is, handler, metadata, new ParseContext())
      is.close()
      val hand = handler.toString()
      val gson = new Gson
      val jsonTree = gson.toJsonTree(metadata)
      val metaNode = jsonTree.getAsJsonObject().getAsJsonObject("metadata")
      val jsonString = gson.toJson(metaNode)
      if (hand.trim().isEmpty()) {
        println("no Text extracted", inputPath)
      } else {
        println("Success")
      }
      val fname = "ABC"
      // Get a lightweight, per-call Table from the shared connection instead
      // of building a new HTable (and a new connection) on every call.
      val hTable: Table = connection.getTable(TableName.valueOf("XXXXXXXXX"))
      val g = new Put(Bytes.toBytes(fname))
      g.add(Bytes.toBytes("txt"), Bytes.toBytes("text"), Bytes.toBytes(hand))
      hTable.put(g)
      val m = new Put(Bytes.toBytes(fname))
      m.add(Bytes.toBytes("data"), Bytes.toBytes("info"), Bytes.toBytes(jsonString))
      hTable.put(m)
      hTable.close()
      fs.close()
    }
    catch {
      case e: Throwable =>
        e.printStackTrace()
    }
  }
}


object App {
  def main(args: Array[String]) {
    // a collection of input paths, so foreach passes a String to extractInfo
    val fnames = Seq("/X/X/XXXXX.XXX")
    fnames.foreach { x => Hbase.extractInfo(x) }
  }
}
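
Since the question is about Spark, the driver would normally distribute the paths and let the executors call Hbase.extractInfo. This is only a hedged sketch, assuming an existing SparkContext sc and that fnames holds the real input paths:

// Each executor gets its own Hbase singleton, so the connection is created
// once per executor and reused across all the paths in its partitions.
val fnames = Seq("/X/X/XXXXX.XXX")
sc.parallelize(fnames).foreach(fp => Hbase.extractInfo(fp))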
