SQL rolling window unique count

Question

Is there an SQL equivalent to this line of code in pandas?

Assuming a is a DataFrame object and the index is a list of times (with hours, minutes, and seconds).

x in this case would just be the other column in the DataFrame besides the index.

# For each row, count the distinct values found in the trailing one-hour window
# that ends at that row's index time, then cast the counts to int.
a.rolling('1h').apply(lambda x: len(np.unique(x))).astype(int)

Sample result: (Time formatted in HH:MM:SS)

                X
05:20:19        4   <- 1 (only 1 unique number)
05:20:19        5   <- 2 (4 and 5 are unique) * same time as before
05:37:18        7   <- 3 (4, 5 and 7 are unique)
05:45:14        4   <- 3 (4, 5, and 7)
05:56:04        4   <- 3 (4, 5, and 7)
06:18:48        6   <- 4 (now 4, 5, 6, and 7)
06:48:34        3   <- 3 (only checks past hour, so now 3, 4, 6)
07:52:48        1   <- 1 (only time in past hour, so only 1)

I'm just using vanilla SQL for this as well.

Thanks so much!

Answer 1

The example below is for BigQuery Standard SQL:

#standardSQL
-- Rolling distinct count: for every row, the number of distinct x values seen
-- in the hour that ends at that row's time t.
WITH `project.dataset.your_table` AS (
  -- Inline sample data standing in for the real table.
  SELECT TIME '05:20:19' AS t, 4 AS x UNION ALL
  SELECT TIME '05:37:18', 7 UNION ALL
  SELECT TIME '05:45:14', 4 UNION ALL
  SELECT TIME '05:56:04', 4 UNION ALL
  SELECT TIME '06:18:48', 5 UNION ALL
  SELECT TIME '06:48:34', 3 UNION ALL
  SELECT TIME '07:52:48', 1
),
hourly_window AS (
  -- Collect, per row, every x whose time falls inside the trailing hour.
  SELECT
    t,
    x,
    -- t is converted to whole seconds since midnight so the frame can be
    -- written as a numeric RANGE offset.
    -- 3599 (not 3600) PRECEDING keeps the window half-open, (t - 1h, t]:
    -- a row exactly one hour older is excluded, matching the default
    -- closed='right' behaviour of pandas rolling('1h').
    ARRAY_AGG(x) OVER (
      ORDER BY TIME_DIFF(t, TIME '00:00:00', SECOND)
      RANGE BETWEEN 3599 PRECEDING AND CURRENT ROW
    ) AS arr
  FROM `project.dataset.your_table`
)
SELECT
  t,
  x,
  -- Analytic COUNT(DISTINCT ...) is not used with an ordered frame here, so
  -- the distinct count is taken over the collected array instead.
  (SELECT COUNT(DISTINCT y) FROM UNNEST(arr) AS y) AS uniques
FROM hourly_window
-- ORDER BY t

with the following result:

Row t           x   uniques  
1   05:20:19    4   1    
2   05:37:18    7   2    
3   05:45:14    4   2    
4   05:56:04    4   2    
5   06:18:48    5   3    
6   06:48:34    3   3    
7   07:52:48    1   1

It uses the exact dummy data from your question. I suspect that in reality you have TIMESTAMP rather than TIME values, so instead of ORDER BY TIME_DIFF(t, TIME '00:00:00', SECOND) you might want to use something like ORDER BY TIMESTAMP_DIFF(t, TIMESTAMP '2000-01-01 00:00:00', SECOND). Your query would then look like this:

#standardSQL
-- Rolling distinct count for TIMESTAMP data: for every row, the number of
-- distinct x values seen in the hour that ends at that row's timestamp t.
WITH `project.dataset.your_table` AS (
  -- Inline sample data standing in for the real table.
  SELECT TIMESTAMP '2018-01-05 05:20:19' AS t, 4 AS x UNION ALL
  SELECT TIMESTAMP '2018-01-05 05:37:18', 7 UNION ALL
  SELECT TIMESTAMP '2018-01-05 05:45:14', 4 UNION ALL
  SELECT TIMESTAMP '2018-01-05 05:56:04', 4 UNION ALL
  SELECT TIMESTAMP '2018-01-05 06:18:48', 5 UNION ALL
  SELECT TIMESTAMP '2018-01-05 06:48:34', 3 UNION ALL
  SELECT TIMESTAMP '2018-01-05 07:52:48', 1
),
hourly_window AS (
  -- Collect, per row, every x whose timestamp falls inside the trailing hour.
  SELECT
    t,
    x,
    -- The sort key is whole seconds since an arbitrary fixed epoch; only the
    -- differences between rows matter for the RANGE frame.
    -- 3599 (not 3600) PRECEDING keeps the window half-open, (t - 1h, t]:
    -- a row exactly one hour older is excluded, matching the default
    -- closed='right' behaviour of pandas rolling('1h').
    -- NOTE(review): if t carries sub-second precision, use a finer unit for
    -- both TIMESTAMP_DIFF and the frame offset.
    ARRAY_AGG(x) OVER (
      ORDER BY TIMESTAMP_DIFF(t, TIMESTAMP '2000-01-01 00:00:00', SECOND)
      RANGE BETWEEN 3599 PRECEDING AND CURRENT ROW
    ) AS arr
  FROM `project.dataset.your_table`
)
SELECT
  t,
  x,
  -- Distinct count over the values collected for this row's window.
  (SELECT COUNT(DISTINCT y) FROM UNNEST(arr) AS y) AS uniques
FROM hourly_window
-- ORDER BY t

with the following result:

Row t                           x   uniques  
1   2018-01-05 05:20:19.000 UTC 4   1    
2   2018-01-05 05:37:18.000 UTC 7   2    
3   2018-01-05 05:45:14.000 UTC 4   2    
4   2018-01-05 05:56:04.000 UTC 4   2    
5   2018-01-05 06:18:48.000 UTC 5   3    
6   2018-01-05 06:48:34.000 UTC 3   3    
7   2018-01-05 07:52:48.000 UTC 1   1

Update: below is a "trick" to address your new requirement (several rows sharing the same time):

#standardSQL
-- Rolling one-hour distinct count that also separates rows sharing the same
-- time t, so that each of them gets its own running count.
WITH `project.dataset.your_table` AS (
  -- Inline sample data; note the two rows at 05:20:19.
  SELECT TIME '05:20:19' AS t, 4 AS x
  UNION ALL SELECT TIME '05:20:19', 5
  UNION ALL SELECT TIME '05:37:18', 7
  UNION ALL SELECT TIME '05:45:14', 4
  UNION ALL SELECT TIME '05:56:04', 4
  UNION ALL SELECT TIME '06:18:48', 6
  UNION ALL SELECT TIME '06:48:34', 3
  UNION ALL SELECT TIME '07:52:48', 1
),
jittered_window AS (
  SELECT
    t,
    x,
    -- The sort key is milliseconds since midnight plus a random offset of
    -- less than one second. The offset breaks ties between rows with equal t,
    -- so a RANGE frame no longer treats them as peers.
    -- Caveat: RAND() makes the order of tied rows (and therefore their
    -- counts) vary from run to run.
    ARRAY_AGG(x) OVER (
      ORDER BY TIME_DIFF(t, TIME '00:00:00', MILLISECOND) + 1000 * RAND()
      RANGE BETWEEN 3600000 PRECEDING AND CURRENT ROW
    ) AS arr
  FROM `project.dataset.your_table`
)
SELECT
  t,
  x,
  -- Distinct count over the values collected for this row's window.
  (SELECT COUNT(DISTINCT y) FROM UNNEST(arr) AS y) AS uniques
FROM jittered_window
-- ORDER BY t

with the following result:

Row t           x   uniques  
1   05:20:19    5   1    
2   05:20:19    4   2    
3   05:37:18    7   3    
4   05:45:14    4   3    
5   05:56:04    4   3    
6   06:18:48    6   4    
7   06:48:34    3   3    
8   07:52:48    1   1

One more update :o)

#standardSQL
-- Rolling one-hour distinct count that handles several rows sharing the same
-- time t deterministically (no random jitter): each row counts the distinct x
-- values among rows that are at most one hour older and not after it in the
-- (t, x) ordering.
WITH `project.dataset.your_table` AS (
  -- Inline sample data; note the two rows at 05:20:19.
  SELECT TIME '05:20:19' AS t, 4 AS x UNION ALL
  SELECT TIME '05:20:19', 5 UNION ALL
  SELECT TIME '05:37:18', 7 UNION ALL
  SELECT TIME '05:45:14', 4 UNION ALL
  SELECT TIME '05:56:04', 4 UNION ALL
  SELECT TIME '06:18:48', 6 UNION ALL
  SELECT TIME '06:48:34', 3 UNION ALL
  SELECT TIME '07:52:48', 1
),
numbered AS (
  SELECT
    t,
    x,
    -- Numeric sort key for the RANGE frame: microseconds since midnight.
    TIME_DIFF(t, TIME '00:00:00', MICROSECOND) AS us,
    -- Stable position of each row. A table has no inherent row order, so rows
    -- with equal t are ordered by x.
    -- NOTE(review): if the real table has a sequence/ingestion column, order
    -- ties by that instead to mirror the source row order.
    ROW_NUMBER() OVER (ORDER BY t, x) AS rn
  FROM `project.dataset.your_table`
),
windowed AS (
  SELECT
    t,
    x,
    rn,
    -- 3599999999 us = one hour minus one microsecond, so the frame is the
    -- half-open interval (t - 1h, t], like pandas rolling('1h').
    -- CURRENT ROW in a RANGE frame includes every row with the same t, so each
    -- value is stored with its position and later ties are filtered out below.
    ARRAY_AGG(STRUCT(x AS val, rn AS seq)) OVER (
      ORDER BY us
      RANGE BETWEEN 3599999999 PRECEDING AND CURRENT ROW
    ) AS arr
  FROM numbered
)
SELECT
  w.t,
  w.x,
  -- Keep only window members at or before this row's position, then count the
  -- distinct values among them.
  (
    SELECT COUNT(DISTINCT y.val)
    FROM UNNEST(w.arr) AS y
    WHERE y.seq <= w.rn
  ) AS uniques
FROM windowed AS w
-- ORDER BY t

Answer 2

Join the table with itself, using the time range relationship as the joining condition. Here's the MySQL syntax:

-- Rolling one-hour distinct count via a self-join: every row (t1) is paired
-- with all rows (t2) from the hour ending at t1.time, and the distinct x
-- values of those rows are counted.
SELECT
    t1.time,
    t1.x,
    COUNT(DISTINCT t2.x)
FROM yourTable AS t1
INNER JOIN yourTable AS t2
    -- Half-open window (t1.time - 1h, t1.time]: BETWEEN would also include a
    -- row exactly one hour older, which pandas rolling('1h') excludes.
    ON  t2.time >  DATE_SUB(t1.time, INTERVAL 1 HOUR)
    AND t2.time <= t1.time
GROUP BY t1.time, t1.x

DEMO

Answer 3

For MySQL, you can use a sub-query, see below:

-- Rolling one-hour distinct count via a correlated subquery: for each row of
-- mytable, count the distinct x values among rows whose date lies in the
-- half-open window (t1.date - 1 hour, t1.date].
SELECT
    t1.date,
    (
        SELECT COUNT(DISTINCT t2.x)
        FROM mytable AS t2
        WHERE t2.date >  DATE_SUB(t1.date, INTERVAL 1 HOUR)
          AND t2.date <= t1.date
    ) AS uniq_rolling_count_of_x
FROM mytable AS t1
ORDER BY t1.date
;

SQL rolling window unique count

Question

3 answers

solution1
2 ACCEPTED 2018-05-04 03:32:48

solution2
0 2018-05-04 00:35:15

solution3
0 2018-05-04 01:48:11

SQL rolling window unique count

Question

3 answers

solution1 2 ACCEPTED 2018-05-04 03:32:48

solution2 0 2018-05-04 00:35:15

solution3 0 2018-05-04 01:48:11

solution1
2 ACCEPTED 2018-05-04 03:32:48

solution2
0 2018-05-04 00:35:15

solution3
0 2018-05-04 01:48:11