
Load multi-line JSON data into HIVE table

I have a JSON document that spans multiple lines, and I created a Hive table to load it into. I also have another JSON document that is a single-line record. When I load the single-line JSON record into its Hive table and query it, it works fine. But when I load the multi-line JSON into its Hive table, it gives the exception below:

Failed with exception java.io.IOException:org.apache.hadoop.hive.serde2.SerDeException: org.codehaus.jackson.JsonParseException: Unexpected end-of-input: expected close marker for OBJECT (from [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 0]) at [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 3]
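For context: org.apache.hive.hcatalog.data.JsonSerDe expects each JSON document to sit on a single line. With STORED AS TEXTFILE, the default input format hands the SerDe one physical line at a time, so for a pretty-printed document it receives only the opening "{". A minimal sketch that reproduces the parser's complaint with the same codehaus Jackson classes named in the stack trace (my illustration, not part of the original question):

import org.codehaus.jackson.map.ObjectMapper;

public class FirstLineRepro {
    public static void main(String[] args) throws Exception {
        // The first physical line of the pretty-printed document is just "{",
        // which is an incomplete JSON object on its own.
        new ObjectMapper().readTree("{");
        // throws org.codehaus.jackson.JsonParseException:
        // "Unexpected end-of-input: expected close marker for OBJECT"
    }
}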

Below is my JSON data:

{
  "uploadTimeStamp" : "1486631318873",
  "PDID" : "123",
  "data" : [ {
    "Data" : {
      "unit" : "rpm",
      "value" : "0"
    },
    "EventID" : "E1",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.0",
    "pii" : { }
  }, {
    "Data" : {
      "heading" : "N",
      "loc3" : "false",
      "loc" : "14.022425",
      "loc1" : "78.760587",
      "loc4" : "false",
      "speed" : "10"
    },
    "EventID" : "E2",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.1",
    "pii" : { }
  }, {
    "Data" : {
      "x" : "1.1",
      "y" : "1.2",
      "z" : "2.2"
    },
    "EventID" : "E3",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.0",
    "pii" : { }
  }, {
    "EventID" : "E4",
    "Data" : {
      "value" : "50",
      "unit" : "percentage"
    },
    "Version" : "1.0",
    "Timestamp" : 1486631318873,
    "PDID" : "123",
    "Timezone" : 330
  }, {
    "Data" : {
      "unit" : "kmph",
      "value" : "70"
    },
    "EventID" : "E5",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.0",
    "pii" : { }
  } ]
}

I am using /hive/lib/hive-hcatalog-core-0.13.0.jar

Below is my create table statement:

create table test7(
uploadtime bigint,
pdid string,
data array<
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
Location:string,
latitude:bigint,
longitude:bigint,
Location2:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
x:int,
y:int,
z:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
loc3:string,
latitude:bigint,
longitude:bigint,
loc4:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>
>
)
ROW FORMAT SERDE 
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
'/xyz/abc/';

Edit:

Adding the single-line JSON and the new create table statement, with its error:

{"uploadTimeStamp":"1487183800905","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"event1","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc1":"false","latitude":"16.032425","longitude":"80.770587","loc2":"false","speed":"10"},"EventID":"event2","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"event3":"AccelerometerInfo","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"event4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1487183800905,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"event5","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}}]}

create table test1(
uploadTimeStamp string,
PDID string,
data array<struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<heading:string,loc1:string,latitude:double,longitude:double,loc2:string,speed:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<x:float,y:float,z:float>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
EventID:string,
Data:struct<value:int,unit:percentage>,
Version:float,
TimeS:bigint,
PDID:string,
Timezone:int>,
struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>
>
ROW FORMAT SERDE 
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION

'/ABC/XYZ/';

MismatchedTokenException(320!=313)
...
...
...
FAILED: ParseException line 11:10 mismatched input '<>' expecting < near 'struct' in struct type

Sample data (the posted document collapsed onto a single line):

{"uploadTimeStamp":"1486631318873","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"E1","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10"},"EventID":"E2","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"EventID":"E3","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"E4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1486631318873,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"E5","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}}]}

add jar /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar

create external table myjson
(
    uploadTimeStamp string
   ,PDID            string

   ,data            array
                    <
                        struct
                        <
                            Data:struct
                            <
                                unit:string
                               ,value:string
                               ,heading:string
                               ,loc3:string
                               ,loc:string
                               ,loc1:string
                               ,loc4:string
                               ,speed:string
                               ,x:string
                               ,y:string
                               ,z:string
                            >
                           ,EventID:string
                           ,PDID:string
                           ,`Timestamp`:bigint
                           ,Timezone:smallint
                           ,Version:string
                           ,pii:struct<dummy:string>
                        >
                    >
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe' 
stored as textfile
location '/tmp/myjson'
;

select * from myjson
;

+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| myjson.uploadtimestamp | myjson.pdid |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         myjson.data                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|          1486631318873 |         123 | [{"data":{"unit":"rpm","value":"0","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E1","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10","x":null,"y":null,"z":null},"eventid":"E2","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.1","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":"1.1","y":"1.2","z":"2.2"},"eventid":"E3","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":"percentage","value":"50","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E4","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":null},{"data":{"unit":"kmph","value":"70","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E5","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}}] |
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

select  j.uploadTimeStamp
       ,j.PDID

       ,d.val.EventID
       ,d.val.PDID
       ,d.val.`Timestamp`
       ,d.val.Timezone
       ,d.val.Version

       ,d.val.Data.unit
       ,d.val.Data.value
       ,d.val.Data.heading
       ,d.val.Data.loc3
       ,d.val.Data.loc
       ,d.val.Data.loc1
       ,d.val.Data.loc4
       ,d.val.Data.speed
       ,d.val.Data.x
       ,d.val.Data.y
       ,d.val.Data.z

from    myjson  j
        lateral view  explode (data) d as val
;

+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
| j.uploadtimestamp | j.pdid | eventid | pdid |   timestamp   | timezone | version |    unit    | value | heading | loc3  |    loc    |   loc1    | loc4  | speed |  x   |  y   |  z   |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
|     1486631318873 |    123 | E1      |  123 | 1486631318873 |      330 | 1.0     | rpm        | 0     | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | NULL | NULL | NULL |
|     1486631318873 |    123 | E2      |  123 | 1486631318873 |      330 | 1.1     | NULL       | NULL  | N       | false | 14.022425 | 78.760587 | false | 10    | NULL | NULL | NULL |
|     1486631318873 |    123 | E3      |  123 | 1486631318873 |      330 | 1.0     | NULL       | NULL  | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | 1.1  | 1.2  | 2.2  |
|     1486631318873 |    123 | E4      |  123 | 1486631318873 |      330 | 1.0     | percentage | 50    | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | NULL | NULL | NULL |
|     1486631318873 |    123 | E5      |  123 | 1486631318873 |      330 | 1.0     | kmph       | 70    | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | NULL | NULL | NULL |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+

I was having the same issue, then decided to create a custom input format that can extract multi-line (pretty-printed) JSON records.

This JsonRecordReader can read a multi-line JSON record in Hive. It extracts each record by balancing the curly braces { and }: the content from the first '{' to its balancing final '}' is treated as one complete record. Below is the code snippet:

// Imports required by this snippet (they belong at the top of the enclosing file):
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;

public static class JsonRecordReader implements RecordReader<LongWritable, Text> {

    public static final String START_TAG_KEY = "jsoninput.start";
    public static final String END_TAG_KEY = "jsoninput.end";

    private byte[] startTag = "{".getBytes();
    private byte[] endTag = "}".getBytes();
    private long start;
    private long end;
    private FSDataInputStream fsin;
    private final DataOutputBuffer buffer = new DataOutputBuffer();

    public JsonRecordReader(FileSplit split, JobConf jobConf) throws IOException {
        // uncomment the below lines if you need to get the configuration
        // from JobConf:
        // startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
        // endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");

        // open the file and seek to the start of the split:
        start = split.getStart();
        end = start + split.getLength();
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(jobConf);
        fsin = fs.open(split.getPath());
        fsin.seek(start);
    }

    @Override
    public boolean next(LongWritable key, Text value) throws IOException {
        if (fsin.getPos() < end) {
            AtomicInteger count = new AtomicInteger(0);
            if (readUntilMatch(false, count)) {
                try {
                    buffer.write(startTag);
                    if (readUntilMatch(true, count)) {
                        key.set(fsin.getPos());
                        // create json record from buffer:
                        String jsonRecord = new String(buffer.getData(), 0, buffer.getLength());
                        value.set(jsonRecord);
                        return true;
                    }
                } finally {
                    buffer.reset();
                }
            }
        }
        return false;
    }

    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    @Override
    public Text createValue() {
        return new Text();
    }

    @Override
    public long getPos() throws IOException {
        return fsin.getPos();
    }

    @Override
    public void close() throws IOException {
        fsin.close();
    }

    @Override
    public float getProgress() throws IOException {
        return ((fsin.getPos() - start) / (float) (end - start));
    }

    private boolean readUntilMatch(boolean withinBlock, AtomicInteger count) throws IOException {
        while (true) {
            int b = fsin.read();
            // end of file:
            if (b == -1)
                return false;

            // save to buffer:
            if (withinBlock)
                buffer.write(b);

            // check if we're matching start/end tag:
            if (b == startTag[0]) {
                count.incrementAndGet();
                if (!withinBlock) {
                    return true;
                }
            } else if (b == endTag[0]) {
                count.getAndDecrement();
                if (count.get() == 0) {
                    return true;
                }
            }

            // see if we've passed the stop point:
            if (!withinBlock && count.get() == 0 && fsin.getPos() >= end)
                return false;
        }
    }

}
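The snippet above is only the RecordReader; to be referenced from a table definition it needs an enclosing InputFormat. A rough sketch of that wrapper, under the old mapred API the reader already uses (the complete, tested version is in the GitHub repository linked below):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class JsonInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(
            InputSplit split, JobConf conf, Reporter reporter) throws IOException {
        // Wire the brace-balancing reader shown above into the job.
        return new JsonRecordReader((FileSplit) split, conf);
    }

    // JsonRecordReader from the snippet above goes here as a static inner class.
}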

This input format can be used together with the JSON SerDe supplied by Hive to read multi-line JSON files.

CREATE TABLE books (id string, bookname string, properties struct<subscription:string, unit:string>) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' STORED AS INPUTFORMAT 'JsonInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

The working code with samples is here: https://github.com/unayakdev/hive-json
