Output directory not set exception when saving an RDD to HBase with Spark
I have a job that reads data from HBase into a Spark RDD, filters it, and then saves the result back to HBase as sample data, like this:
object FilterData {
  def main(args: Array[String]) {
    filterData()
  }

  def filterData() = {
    val sparkConf = new SparkConf().setAppName("filterData").setMaster("spark://spark:7077")
    val sc = new SparkContext(sparkConf)
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "172.16.1.10,172.16.1.11,172.16.1.12")
    conf.setInt("timeout", 120000)
    conf.set(TableInputFormat.INPUT_TABLE, "dataset")
    val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
    val filtered = hbaseRDD.filter {
      case tuple => {
        val result = tuple._2
        val hostId = new String(result.getValue("user".getBytes, "id".getBytes))
        hostId == "12345" // <-- only retrieve the row when user:id is 12345
      }
    }
    val jobConfig: JobConf = new JobConf(conf, this.getClass)
    jobConfig.setOutputFormat(classOf[TableOutputFormat]) // old mapred TableOutputFormat
    jobConfig.set(TableOutputFormat.OUTPUT_TABLE, "sample_data")
    filtered.saveAsNewAPIHadoopDataset(jobConfig)
  }
}
I had already created the target table in HBase with the hbase shell and specified the column family.
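For reference, a minimal shell command that creates such a table is sketched below; the column family name user is an assumption taken from the getValue("user".getBytes, "id".getBytes) call above, not something stated in the question:

  create 'sample_data', 'user'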
But when this code runs, it throws the following exception:
Exception in thread "main" org.apache.hadoop.mapred.InvalidJobConfException: Output directory not set.
at org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:138)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1088)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1074)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1074)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1074)
at com.xinmeiti.spark.weibo.FilterData$.filterData(FilterData.scala:45)
at com.xinmeiti.spark.weibo.FilterData$.main(FilterData.scala:20)
at com.xinmeiti.spark.weibo.FilterData.main(FilterData.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
What went wrong?

JobConf and setOutputFormat belong to the old org.apache.hadoop.mapred API, but saveAsNewAPIHadoopDataset only honors the output format registered through the new org.apache.hadoop.mapreduce API. Since none is set there, the job falls back to the default FileOutputFormat-based check, which is exactly the "Output directory not set" failure in the stack trace. Configuring a new-API Job with the mapreduce TableOutputFormat resolves it:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat

object FilterData {
  def main(args: Array[String]) {
    filterData()
  }

  def filterData() = {
    val sparkConf = new SparkConf().setAppName("filterData").setMaster("spark://spark:7077")
    val sc = new SparkContext(sparkConf)
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "172.16.1.10,172.16.1.11,172.16.1.12")
    conf.setInt("timeout", 120000)
    conf.set(TableInputFormat.INPUT_TABLE, "dataset")
    val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
    val filtered = hbaseRDD.filter {
      case tuple => {
        val result = tuple._2
        val hostId = new String(result.getValue("user".getBytes, "id".getBytes))
        hostId == "12345" // <-- only retrieve the row when user:id is 12345
      }
    }
    // Configure the output through the new mapreduce API instead of JobConf
    val jobConfig: Job = Job.getInstance(conf)
    jobConfig.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    jobConfig.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "sample_data")
    filtered.saveAsNewAPIHadoopDataset(jobConfig.getConfiguration())
  }
}
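One caveat with the fixed version: the new-API TableOutputFormat writes Mutation objects (Put or Delete), while filtered still carries the raw Result values read from the source table, so on recent HBase versions the save can still fail at write time with a ClassCastException. Below is a minimal sketch, under that assumption, of converting each Result into a Put before saving; the conversion loop is illustrative and not part of the original answer:

import org.apache.hadoop.hbase.client.Put

// Rebuild a Put from every cell of each filtered Result so that
// TableOutputFormat receives Mutations instead of Results.
val puts = filtered.map { case (key, result) =>
  val put = new Put(result.getRow)
  result.rawCells().foreach(cell => put.add(cell)) // copy each cell unchanged
  (new ImmutableBytesWritable(result.getRow), put)
}
puts.saveAsNewAPIHadoopDataset(jobConfig.getConfiguration())

Because each Put is keyed on result.getRow, the sample_data table ends up with the same row keys as the source rows that passed the filter.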