繁体   English   中英

使用带有破折号的数据源名称查询 Druid SQL 内部联接

[英]Query Druid SQL inner join with a dataSource name that has a dash

如何在两个数据源之间编写一个 INNER JOIN 查询,其中一个数据源的模式名称为破折号

对 Druid SQL 二进制文件执行以下查询会导致查询错误

SELECT * 
FROM first 
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;

org.apache.druid.java.util.common.ISE:无法构建查询计划

尝试引用名称中包含破折号的数据源时,这是正确的语法吗?

架构

[
  {
    "dataSchema": {
      "dataSource": "second-schema",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "device_id",
              "device_name",
              "x_1",
              "x_2",
              "x_3",
              "vlan",
              "s_x",
              "d_x",
              "d_p",
              "msg_type"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
        {
          "type": "count",
          "name": "event_count"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "flow-info",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "flow-info"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  },
  {
    "dataSchema": {
      "dataSource": "first",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "category",
              "device_id",
              "device_name",
              "severity",
              "x_2",
              "x_3",
              "x_4",
              "x_5",
              "vlan",
              "s_x",
              "d_x",
              "s_i",
              "d_i",
              "d_p",
              "id"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "doubleSum",  "name": "val_num",      "fieldName": "val_num" },
        { "type": "doubleMin",  "name": "val_num_min",  "fieldName": "val_num" },
        { "type": "doubleMax",  "name": "val_num_max",  "fieldName": "val_num" },
        { "type": "doubleSum",  "name": "size",         "fieldName": "size" },
        { "type": "doubleMin",  "name": "size_min",     "fieldName": "size" },
        { "type": "doubleMax",  "name": "size_max",     "fieldName": "size" },
        { "type": "count", "name": "first_count" }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "first",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "first"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  }
]

根据您的架构定义,我将进行一些观察。

  1. 在进行连接时,您通常必须明确列出列(不使用 *),否则您会从重复列中得到冲突。 例如,在您的加入中,您在“第一”和“第二模式”中都有一个 device_id,更不用说在两者中相同的所有其他列。

  2. 使用文字分隔符时,我不会混淆它们。 我要么使用它们,要么不使用它们。

所以我认为你的查询将以更像这样的形式更好地工作

 SELECT
    "first"."etid",
    "first"."category",
    "first"."device_id",
    "first"."device_name",
    "first"."severity",
    "first"."x_2",
    "first"."x_3",
    "first"."x_4",
    "first"."x_5",
    "first"."vlan",
    "first"."s_x",
    "first"."d_x",
    "first"."s_i",
    "first"."d_i",
    "first"."d_p",
    "first"."id",
    "second-schema"."etid" as "ss_etid",
    "second-schema"."device_id" as "ss_device_id",
    "second-schema"."device_name" as "ss_device_name",
    "second-schema"."x_1" as "ss_x_1",
    "second-schema"."x_2" as "ss_x_2",
    "second-schema"."x_3" as "ss_x_3",
    "second-schema"."vlan" as "ss_vlan",
    "second-schema"."s_x" as "ss_s_x",
    "second-schema"."d_x" as "ss_d_x",
    "second-schema"."d_p" as "ss_d_p",
    "second-schema"."msg_type"
 FROM "first"
 INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";

显然,您可以随意命名列,或者根据需要包含排除列。 Select * 仅在两个表中的所有列都是唯一的情况下才有效。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM