
Loading 1GB of data into HBase taking 1 hour

I want to load a 1 GB (10 million record) CSV file into HBase. I wrote a MapReduce program for it. My code works correctly, but the job takes 1 hour to complete, and the last reducer alone takes more than half an hour. Could anyone please help me out?

My code is as follows:

Driver.java

package com.cloudera.examples.hbase.bulkimport;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.KeyValue;
    import org.apache.hadoop.hbase.client.HTable;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    /**
     * HBase bulk import example
     * Data preparation MapReduce job driver
     *
     *   1. args[0]: HDFS input path
     *   2. args[1]: HDFS output path
     *   3. args[2]: HBase table name
     */
    public class Driver {

      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        /*
         * NBA Final 2010 game 1 tip-off time (seconds from epoch)
         * Thu, 03 Jun 2010 18:00:00 PDT
         */
        // conf.setInt("epoch.seconds.tipoff", 1275613200);

        conf.set("hbase.table.name", args[2]);

        // Load hbase-site.xml
        HBaseConfiguration.addHbaseResources(conf);

        Job job = new Job(conf, "HBase Bulk Import Example");
        job.setJarByClass(HBaseKVMapper.class);

        job.setMapperClass(HBaseKVMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        job.setInputFormatClass(TextInputFormat.class);

        HTable hTable = new HTable(conf, args[2]);

        // Auto configure partitioner and reducer
        HFileOutputFormat.configureIncrementalLoad(job, hTable);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);

        // Load generated HFiles into table
        // LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
        // loader.doBulkLoad(new Path(args[1]), hTable);
      }
    }
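Note that the last two lines of main() are commented out, so this driver only writes HFiles under args[1]; nothing appears in the table until those files are bulk-loaded. A minimal sketch of that final step, using the same conf, hTable, and args as in main() above (it simply mirrors the commented-out loader lines):

        // After job.waitForCompletion(true): hand the generated HFiles to HBase.
        // Needs: import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
        loader.doBulkLoad(new Path(args[1]), hTable);
        hTable.close();

The same step can also be run from the shell with the LoadIncrementalHFiles tool (hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles <hdfs output path> <table name>), often referred to as completebulkload.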

HColumnEnum.java

package com.cloudera.examples.hbase.bulkimport;

    /**
     * HBase table columns for the 'srv' column family
     */
    public enum HColumnEnum {
      SRV_COL_employeeid ("employeeid".getBytes()),
      SRV_COL_eventdesc ("eventdesc".getBytes()),
      SRV_COL_eventdate ("eventdate".getBytes()),
      SRV_COL_objectname ("objectname".getBytes()),
      SRV_COL_objectfolder ("objectfolder".getBytes()),
      SRV_COL_ipaddress ("ipaddress".getBytes());

      private final byte[] columnName;

      HColumnEnum (byte[] column) {
        this.columnName = column;
      }

      public byte[] getColumnName() {
        return this.columnName;
      }
    }

HBaseKVMapper.java

    package com.cloudera.examples.hbase.bulkimport;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.KeyValue;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import au.com.bytecode.opencsv.CSVParser;

    /**
     * HBase bulk import example
     * <p>
     * Parses Facebook and Twitter messages from CSV files and outputs
     * <ImmutableBytesWritable, KeyValue>.
     * <p>
     * The ImmutableBytesWritable key is used by the TotalOrderPartitioner to map it
     * into the correct HBase table region.
     * <p>
     * The KeyValue value holds the HBase mutation information (column family,
     * column, and value)
     */
    public class HBaseKVMapper extends
        Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

      final static byte[] SRV_COL_FAM = "srv".getBytes();
      final static int NUM_FIELDS = 6;

      CSVParser csvParser = new CSVParser();
      int tipOffSeconds = 0;
      String tableName = "";

      // DateTimeFormatter p = DateTimeFormat.forPattern("MMM dd, yyyy HH:mm:ss")
      //     .withLocale(Locale.US).withZone(DateTimeZone.forID("PST8PDT"));

      ImmutableBytesWritable hKey = new ImmutableBytesWritable();
      KeyValue kv;

      /** {@inheritDoc} */
      @Override
      protected void setup(Context context) throws IOException,
          InterruptedException {
        Configuration c = context.getConfiguration();

        // tipOffSeconds = c.getInt("epoch.seconds.tipoff", 0);
        tableName = c.get("hbase.table.name");
      }

      /** {@inheritDoc} */
      @Override
      protected void map(LongWritable key, Text value, Context context)
          throws IOException, InterruptedException {

        /*if (value.find("Service,Term,") > -1) {
          // Skip header
          return;
        }*/

        String[] fields = null;

        try {
          fields = value.toString().split(","); //csvParser.parseLine(value.toString());
        } catch (Exception ex) {
          context.getCounter("HBaseKVMapper", "PARSE_ERRORS").increment(1);
          return;
        }

        if (fields.length != NUM_FIELDS) {
          context.getCounter("HBaseKVMapper", "INVALID_FIELD_LEN").increment(1);
          return;
        }

        // Get game offset in seconds from tip-off
        /* DateTime dt = null;

        try {
          dt = p.parseDateTime(fields[9]);
        } catch (Exception ex) {
          context.getCounter("HBaseKVMapper", "INVALID_DATE").increment(1);
          return;
        }

        int gameOffset = (int) ((dt.getMillis() / 1000) - tipOffSeconds);
        String offsetForKey = String.format("%04d", gameOffset);

        String username = fields[2];
        if (username.equals("")) {
          username = fields[3];
        }*/

        // Key: eg "1200:twitter:jrkinley"
        hKey.set(String.format("%s|%s|%s|%s|%s|%s", fields[0], fields[1],
            fields[2], fields[3], fields[4], fields[5]).getBytes());

        // Service columns
        if (!fields[0].equals("")) {
          kv = new KeyValue(hKey.get(), SRV_COL_FAM,
              HColumnEnum.SRV_COL_employeeid.getColumnName(), fields[0].getBytes());
          context.write(hKey, kv);
        }
        if (!fields[1].equals("")) {
          kv = new KeyValue(hKey.get(), SRV_COL_FAM,
              HColumnEnum.SRV_COL_eventdesc.getColumnName(), fields[1].getBytes());
          context.write(hKey, kv);
        }
        if (!fields[2].equals("")) {
          kv = new KeyValue(hKey.get(), SRV_COL_FAM,
              HColumnEnum.SRV_COL_eventdate.getColumnName(), fields[2].getBytes());
          context.write(hKey, kv);
        }
        if (!fields[3].equals("")) {
          kv = new KeyValue(hKey.get(), SRV_COL_FAM,
              HColumnEnum.SRV_COL_objectname.getColumnName(), fields[3].getBytes());
          context.write(hKey, kv);
        }
        if (!fields[4].equals("")) {
          kv = new KeyValue(hKey.get(), SRV_COL_FAM,
              HColumnEnum.SRV_COL_objectfolder.getColumnName(), fields[4].getBytes());
          context.write(hKey, kv);
        }
        if (!fields[5].equals("")) {
          kv = new KeyValue(hKey.get(), SRV_COL_FAM,
              HColumnEnum.SRV_COL_ipaddress.getColumnName(), fields[5].getBytes());
          context.write(hKey, kv);
        }

        context.getCounter("HBaseKVMapper", "NUM_MSGS").increment(1);

        /*
         * Output number of messages per quarter and before/after game. This should
         * correspond to the number of messages per region in HBase
         */
        /* if (gameOffset < 0) {
          context.getCounter("QStats", "BEFORE_GAME").increment(1);
        } else if (gameOffset < 900) {
          context.getCounter("QStats", "Q1").increment(1);
        } else if (gameOffset < 1800) {
          context.getCounter("QStats", "Q2").increment(1);
        } else if (gameOffset < 2700) {
          context.getCounter("QStats", "Q3").increment(1);
        } else if (gameOffset < 3600) {
          context.getCounter("QStats", "Q4").increment(1);
        } else {
          context.getCounter("QStats", "AFTER_GAME").increment(1);
        }*/
      }
    }

Please help me improve the performance, or let me know if you have an alternative solution with sample code.

My mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

<property>
  <name>mapred.job.tracker</name>
  <value>namenode:54311</value>
</property>

<property>
  <name>mapred.reduce.parallel.copies</name>
  <value>20</value>
</property>

<property>
  <name>tasktracker.http.threads</name>
  <value>50</value>
</property>

<property>
  <name>mapred.job.shuffle.input.buffer.percent</name>
  <value>0.70</value>
</property>

<property>
  <name>mapred.tasktracker.map.tasks.maximum</name>
  <value>4</value>
</property>

<property>
  <name>mapred.tasktracker.reduce.tasks.maximum</name>
  <value>4</value>
</property>

<property>
  <name>mapred.map.tasks</name>
  <value>4</value>
</property>

<property>
  <name>reduce.map.tasks</name>
  <value>4</value>
</property>

<property>
  <name>mapred.job.shuffle.merge.percent</name>
  <value>0.65</value>
</property>

<property>
  <name>mapred.task.timeout</name>
  <value>1200000</value>
</property>

<property>
  <name>mapred.child.java.opts</name>
  <value>-Xms1024M -Xmx2048M</value>
</property>

<property>
  <name>mapred.job.reuse.jvm.num.tasks</name>
  <value>-1</value>
</property>

<property>
  <name>mapred.compress.map.output</name>
  <value>true</value>
</property>

<property>
  <name>mapred.map.output.compression.codec</name>
  <value>com.hadoop.compression.lzo.LzoCodec</value>
</property>

<property>
  <name>io.sort.mb</name>
  <value>800</value>
</property>

<property>
  <name>mapred.child.ulimit</name>
  <value>unlimited</value>
</property>

<property>
  <name>io.sort.factor</name>
  <value>100</value>
  <description>More streams merged at once while sorting files.</description>
</property>

<property>
  <name>mapreduce.admin.map.child.java.opts</name>
  <value>-Djava.net.preferIPv4Stack=true</value>
</property>

<property>
  <name>mapreduce.admin.reduce.child.java.opts</name>
  <value>-Djava.net.preferIPv4Stack=true</value>
</property>

<property>
  <name>mapred.min.split.size</name>
  <value>0</value>
</property>

<property>
  <name>mapred.job.map.memory.mb</name>
  <value>-1</value>
</property>

<property>
  <name>mapred.jobtracker.maxtasks.per.job</name>
  <value>-1</value>
</property>

</configuration>

hbase-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
    <name>hbase.rootdir</name>
    <value>hdfs://namenode:54310/hbase</value>
    <description>The directory shared by RegionServers.
    </description>
</property>

<property>
    <name>hbase.master</name>
    <value>slave:60000</value>
    <description>The host and port that the HBase master runs at.
    A value of 'local' runs the master and a regionserver
    in a single process.
    </description>
</property>

<property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
    <description>The mode the cluster will be in. Possible values are
    false: standalone and pseudo-distributed setups with managed Zookeeper
    true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
    </description>
</property>

<property>
    <name>hbase.zookeeper.quorum</name>
    <value>slave</value>
    <description>Comma separated list of servers in the ZooKeeper Quorum.
    For example, "host1.mydomain.com,host2.mydomain.com,host3.mydomain.com".
    By default this is set to localhost for local and pseudo-distributed modes
    of operation. For a fully-distributed setup, this should be set to a full
    list of ZooKeeper quorum servers. If HBASE_MANAGES_ZK is set in hbase-env.sh
    this is the list of servers which we will start/stop ZooKeeper on.
    </description>
</property>

<property>
       <name>hbase.zookeeper.property.clientPort</name>
       <value>2181</value>
</property>

<property>
    <name>hbase.zookeeper.property.dataDir</name>
    <value>/home/hduser/work/zoo_data</value>
    <description>Property from ZooKeeper's config zoo.cfg.
    The directory where the snapshot is stored.
    </description>
</property>

</configuration>

Please help me out so I can improve the performance.

First of all, why do we need a MapReduce program to load such a small file (1 GB) into HBase?

In my experience, I have processed 5 GB of JSON using Jackson streaming (I did not want to pull all of the JSON into memory) and persisted it into HBase within 8 minutes by using a batching technique.

I used HBase Puts in batched List objects of 100,000 records each.

Below is the code snippet through which I achieved this. (The same thing can be done while parsing other formats as well.)

You may need to call this method in two places (a usage sketch follows the method below):

1) with a full batch of 100,000 records;

2) to process the remainder, when fewer than 100,000 records are left.

    // Note: HBaseConnection.getHBaseConfiguration() and getTable(tableName) are the
    // answerer's own helpers for obtaining the HBase configuration and the physical
    // table name; they are not shown here.
    public void addRecord(final ArrayList<Put> puts, final String tableName) throws Exception {
        HTable table = null;
        try {
            table = new HTable(HBaseConnection.getHBaseConfiguration(), getTable(tableName));
            table.put(puts);
            LOG.info("INSERT record[s] " + puts.size() + " to table " + tableName + " OK.");
        } catch (final Throwable e) {
            e.printStackTrace();
        } finally {
            LOG.info("Processed ---> " + puts.size());
            if (puts != null) {
                puts.clear();
            }
            if (table != null) {
                table.close();   // release the table; the original snippet never closed it
            }
        }
    }
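As a rough illustration of the two call sites above, here is a minimal sketch (not from the original answer) that reads the CSV line by line, builds one Put per row using the same six columns and the "srv" column family from the question, and flushes through addRecord() once per full batch of 100,000 records plus once for the remainder. BATCH_SIZE, loadCsv, and the reader variables are illustrative names.

    // Hypothetical driver loop for the batching approach described above.
    // Needs: java.io.BufferedReader, java.io.FileReader, java.util.ArrayList,
    //        org.apache.hadoop.hbase.client.Put, org.apache.hadoop.hbase.util.Bytes
    private static final int BATCH_SIZE = 100000;

    public void loadCsv(final String csvPath, final String tableName) throws Exception {
        final ArrayList<Put> puts = new ArrayList<Put>(BATCH_SIZE);
        final BufferedReader reader = new BufferedReader(new FileReader(csvPath));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                final String[] f = line.split(",");
                if (f.length != 6) {
                    continue;                              // skip malformed rows
                }
                final Put put = new Put(Bytes.toBytes(
                        f[0] + "|" + f[1] + "|" + f[2] + "|" + f[3] + "|" + f[4] + "|" + f[5]));
                put.add(Bytes.toBytes("srv"), Bytes.toBytes("employeeid"),   Bytes.toBytes(f[0]));
                put.add(Bytes.toBytes("srv"), Bytes.toBytes("eventdesc"),    Bytes.toBytes(f[1]));
                put.add(Bytes.toBytes("srv"), Bytes.toBytes("eventdate"),    Bytes.toBytes(f[2]));
                put.add(Bytes.toBytes("srv"), Bytes.toBytes("objectname"),   Bytes.toBytes(f[3]));
                put.add(Bytes.toBytes("srv"), Bytes.toBytes("objectfolder"), Bytes.toBytes(f[4]));
                put.add(Bytes.toBytes("srv"), Bytes.toBytes("ipaddress"),    Bytes.toBytes(f[5]));
                puts.add(put);

                if (puts.size() == BATCH_SIZE) {
                    addRecord(puts, tableName);            // call site 1: full batch of 100,000
                }
            }
            if (!puts.isEmpty()) {
                addRecord(puts, tableName);                // call site 2: the remainder (< 100,000)
            }
        } finally {
            reader.close();
        }
    }

Since addRecord() clears the list after each flush, the same ArrayList can be reused for every batch.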

I have now created only the mapper class and used the HBase output format class, and it takes 10 minutes. My network speed is very slow, which is why it is taking this long.
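For reference, a minimal sketch (not the poster's actual code) of what such a map-only job can look like, assuming TableOutputFormat is the HBase output format class referred to above; the class names MapOnlyDriver and CsvToPutMapper are illustrative:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
    import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
    import org.apache.hadoop.hbase.util.Bytes;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

    /** Map-only load: mappers write Puts straight to HBase through TableOutputFormat. */
    public class MapOnlyDriver {

      /** Illustrative mapper: same CSV layout and 'srv' family as HBaseKVMapper, but emits Put. */
      public static class CsvToPutMapper
          extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
          String[] f = value.toString().split(",");
          if (f.length != 6) {
            return;                                        // skip malformed rows
          }
          byte[] rowKey = Bytes.toBytes(
              f[0] + "|" + f[1] + "|" + f[2] + "|" + f[3] + "|" + f[4] + "|" + f[5]);
          Put put = new Put(rowKey);
          put.add(Bytes.toBytes("srv"), Bytes.toBytes("employeeid"), Bytes.toBytes(f[0]));
          // ... add the remaining five columns exactly as in HBaseKVMapper ...
          context.write(new ImmutableBytesWritable(rowKey), put);
        }
      }

      public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set(TableOutputFormat.OUTPUT_TABLE, args[1]);   // target HBase table

        Job job = new Job(conf, "CSV to HBase (map-only)");
        job.setJarByClass(MapOnlyDriver.class);
        job.setMapperClass(CsvToPutMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Put.class);
        job.setNumReduceTasks(0);                            // map-only: no shuffle, no reducers

        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.waitForCompletion(true);
      }
    }

Writing through TableOutputFormat issues ordinary Puts against the region servers, so it avoids the reduce-side sort of the bulk-load approach but pushes every write through the regular write path (WAL and memstore).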

It can be further fine-tuned by specifying the number of region splits to use when creating the HBase table, since the number of reducer instances for the bulk load also depends on the number of regions. This can be done with the following command:

hbase org.apache.hadoop.hbase.util.RegionSplitter -c <number of regions> -f <column families> <New Hbase Table Name> <splitAlgorithm>

For the split algorithm, one can specify one of the following (an example invocation is shown after the list):

  • UniformSplit - treats keys as arbitrary bytes

  • HexStringSplit - treats keys as hexadecimal ASCII
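For example, pre-splitting a table for the question's 'srv' column family into 8 regions with UniformSplit might look like the following (the table name employee_events and the region count are only illustrative; UniformSplit is usually the safer choice for arbitrary, non-hex row keys like the pipe-delimited ones above):

hbase org.apache.hadoop.hbase.util.RegionSplitter -c 8 -f srv employee_events UniformSplit

Because HFileOutputFormat.configureIncrementalLoad creates one reducer per region of the target table, pre-splitting spreads the sort work across several reducers instead of leaving most of it to a single one.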
