使用带有破折号的数据源名称查询 Druid SQL 内部联接

Question

如何在两个数据源之间编写一个 INNER JOIN 查询，其中一个数据源的模式名称为破折号

对 Druid SQL 二进制文件执行以下查询会导致查询错误

SELECT * 
FROM first 
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;

org.apache.druid.java.util.common.ISE：无法构建查询计划

尝试引用名称中包含破折号的数据源时，这是正确的语法吗？

架构

[
  {
    "dataSchema": {
      "dataSource": "second-schema",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "device_id",
              "device_name",
              "x_1",
              "x_2",
              "x_3",
              "vlan",
              "s_x",
              "d_x",
              "d_p",
              "msg_type"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
        {
          "type": "count",
          "name": "event_count"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "flow-info",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "flow-info"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  },
  {
    "dataSchema": {
      "dataSource": "first",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "category",
              "device_id",
              "device_name",
              "severity",
              "x_2",
              "x_3",
              "x_4",
              "x_5",
              "vlan",
              "s_x",
              "d_x",
              "s_i",
              "d_i",
              "d_p",
              "id"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "doubleSum",  "name": "val_num",      "fieldName": "val_num" },
        { "type": "doubleMin",  "name": "val_num_min",  "fieldName": "val_num" },
        { "type": "doubleMax",  "name": "val_num_max",  "fieldName": "val_num" },
        { "type": "doubleSum",  "name": "size",         "fieldName": "size" },
        { "type": "doubleMin",  "name": "size_min",     "fieldName": "size" },
        { "type": "doubleMax",  "name": "size_max",     "fieldName": "size" },
        { "type": "count", "name": "first_count" }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "first",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "first"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  }
]

Answer 1

根据您的架构定义，我将进行一些观察。

在进行连接时，您通常必须明确列出列（不使用 *），否则您会从重复列中得到冲突。 例如，在您的加入中，您在“第一”和“第二模式”中都有一个 device_id，更不用说在两者中相同的所有其他列。
使用文字分隔符时，我不会混淆它们。 我要么使用它们，要么不使用它们。

所以我认为你的查询将以更像这样的形式更好地工作

 SELECT
    "first"."etid",
    "first"."category",
    "first"."device_id",
    "first"."device_name",
    "first"."severity",
    "first"."x_2",
    "first"."x_3",
    "first"."x_4",
    "first"."x_5",
    "first"."vlan",
    "first"."s_x",
    "first"."d_x",
    "first"."s_i",
    "first"."d_i",
    "first"."d_p",
    "first"."id",
    "second-schema"."etid" as "ss_etid",
    "second-schema"."device_id" as "ss_device_id",
    "second-schema"."device_name" as "ss_device_name",
    "second-schema"."x_1" as "ss_x_1",
    "second-schema"."x_2" as "ss_x_2",
    "second-schema"."x_3" as "ss_x_3",
    "second-schema"."vlan" as "ss_vlan",
    "second-schema"."s_x" as "ss_s_x",
    "second-schema"."d_x" as "ss_d_x",
    "second-schema"."d_p" as "ss_d_p",
    "second-schema"."msg_type"
 FROM "first"
 INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";

显然，您可以随意命名列，或者根据需要包含排除列。 Select * 仅在两个表中的所有列都是唯一的情况下才有效。

使用带有破折号的数据源名称查询 Druid SQL 内部联接

问题描述

1 个解决方案

解决方案1
2 已采纳 2020-08-10 19:50:28

使用带有破折号的数据源名称查询 Druid SQL 内部联接

问题描述

1 个解决方案

解决方案1 2 已采纳 2020-08-10 19:50:28

解决方案1
2 已采纳 2020-08-10 19:50:28