Query Druid SQL inner join with a dataSource name that has a dash

Question

How do I write an INNER JOIN query between two data sources when one of them has a dash in its dataSource name?

Executing the following query with the Druid SQL binary results in a query error:

-- Query as submitted: joins first to "second-schema" on device_id; only the dashed name is double-quoted.
SELECT * 
FROM first 
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;

org.apache.druid.java.util.common.ISE: Cannot build plan for query

Is this the correct syntax for referencing a data source that has a dash in its name?

Schema

[
  {
    "dataSchema": {
      "dataSource": "second-schema",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "device_id",
              "device_name",
              "x_1",
              "x_2",
              "x_3",
              "vlan",
              "s_x",
              "d_x",
              "d_p",
              "msg_type"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
        {
          "type": "count",
          "name": "event_count"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "flow-info",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "flow-info"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  },
  {
    "dataSchema": {
      "dataSource": "first",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "category",
              "device_id",
              "device_name",
              "severity",
              "x_2",
              "x_3",
              "x_4",
              "x_5",
              "vlan",
              "s_x",
              "d_x",
              "s_i",
              "d_i",
              "d_p",
              "id"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "doubleSum",  "name": "val_num",      "fieldName": "val_num" },
        { "type": "doubleMin",  "name": "val_num_min",  "fieldName": "val_num" },
        { "type": "doubleMax",  "name": "val_num_max",  "fieldName": "val_num" },
        { "type": "doubleSum",  "name": "size",         "fieldName": "size" },
        { "type": "doubleMin",  "name": "size_min",     "fieldName": "size" },
        { "type": "doubleMax",  "name": "size_max",     "fieldName": "size" },
        { "type": "count", "name": "first_count" }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "first",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "first"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  }
]

Answer 1

Based on your schema definitions there are a few observations I'll make.

When doing a join, you usually have to list the columns explicitly (rather than using *); otherwise you get collisions from duplicate column names. In your join, for example, device_id exists in both "first" and "second-schema", not to mention all the other columns that are the same across both.
When using identifier delimiters (double quotes), I don't mix styles: I either quote every identifier or none of them.

So I think your query will work better in a form more like this:

 -- Join "first" to "second-schema" on device_id, naming every output column explicitly.
 -- Every identifier is double-quoted; the dash in "second-schema" means that name must be quoted.
 SELECT
      -- Columns from "first", kept under their own names.
      "first"."etid",
      "first"."category",
      "first"."device_id",
      "first"."device_name",
      "first"."severity",
      "first"."x_2",
      "first"."x_3",
      "first"."x_4",
      "first"."x_5",
      "first"."vlan",
      "first"."s_x",
      "first"."d_x",
      "first"."s_i",
      "first"."d_i",
      "first"."d_p",
      "first"."id",
      -- Columns from "second-schema", aliased with an ss_ prefix so they do not
      -- clash with the same-named columns selected from "first".
      "second-schema"."etid"        AS "ss_etid",
      "second-schema"."device_id"   AS "ss_device_id",
      "second-schema"."device_name" AS "ss_device_name",
      "second-schema"."x_1"         AS "ss_x_1",
      "second-schema"."x_2"         AS "ss_x_2",
      "second-schema"."x_3"         AS "ss_x_3",
      "second-schema"."vlan"        AS "ss_vlan",
      "second-schema"."s_x"         AS "ss_s_x",
      "second-schema"."d_x"         AS "ss_d_x",
      "second-schema"."d_p"         AS "ss_d_p",
      -- msg_type is not selected from "first", so it is left unaliased.
      "second-schema"."msg_type"
 FROM "first"
 INNER JOIN "second-schema"
      ON "first"."device_id" = "second-schema"."device_id";

Obviously, feel free to name columns as you see fit, or include or exclude columns as needed. SELECT * will only work when all column names across both tables are unique.

Query Druid SQL inner join with a dataSource name that has a dash

Question

1 answers

solution1
2 ACCEPTED 2020-08-10 19:50:28

Query Druid SQL inner join with a dataSource name that has a dash

Question

1 answers

solution1 2 ACCEPTED 2020-08-10 19:50:28

solution1
2 ACCEPTED 2020-08-10 19:50:28