[英]Snowflake - Getting a Count of Distinct Users While Using Window Frame or an Order
我正在尝试编写一个查询来获取一个月内的累积用户数。
-- Question's attempt: count distinct users per day, then keep a running sum
-- of those daily counts.
WITH USERS_PER_DAY AS (
-- One row per calendar day with that day's distinct CLIENT_SID count.
SELECT
DATE_TRUNC('day', HOUR_DIM.UTC) DAY
, COUNT(DISTINCT CLIENT_SID) ACTIVE_USER_COUNT
FROM RPT.S_HOURLY_INACTIVE_TVS_AGG
-- Keeps only rows whose month difference from CURRENT_DATE is 0.
-- NOTE(review): HOUR_DIM is not introduced by the FROM clause shown here;
-- presumably a join was omitted from the snippet -- confirm.
WHERE DATEDIFF('month', HOUR_DIM.UTC, CURRENT_DATE) BETWEEN 0 AND 0
GROUP BY
DATE_TRUNC('day', HOUR_DIM.UTC)
)
SELECT
DAY,
-- Running total of the per-day distinct counts. A user seen on several days
-- contributes once for each of those days, so this is not a distinct-user
-- total across days.
-- NOTE(review): APP_NAME is not selected by USERS_PER_DAY, so this
-- PARTITION BY cannot be resolved as written -- confirm the intended grouping.
SUM(ACTIVE_USER_COUNT) OVER (PARTITION BY APP_NAME ORDER BY DAY ASC rows between unbounded preceding and current row) CUMULATIVE_ACTIVE_USER_ACOUNT
FROM USERS_PER_DAY
输出现在如下所示:
问题是我需要统计本月内不同(唯一)用户的数量,但这个查询会把在不同日期都出现过的同一用户重复计入。我知道不能在窗口函数中使用 count(distinct ...),那么有没有其他办法可以确保同一用户不会在不同日期之间被重复计数?
因此,一个朴素的解决方案是:先把数据整理成去重后的日期列表和每天去重后的用户列表(各放在一个 CTE 中),然后将这两个 CTE 连接(join)起来得到结果:
-- Running count of distinct users: every active day is paired with all
-- (day, user) records on or before it, and each user is counted once.
with data as (
    -- Inline sample events: one row per (timestamp, user) observation.
    select
        hour_dim_utc::timestamp_ntz as hour_dim_utc
        ,user_id
    from values
        ('2020-03-10 9:50', 1),
        ('2020-03-10 9:51', 3),
        ('2020-03-10 10:51', 3),
        ('2020-03-11 9:52', 1),
        ('2020-03-11 9:53', 2),
        ('2020-03-11 9:54', 0),
        ('2020-03-12 9:55', 0),
        ('2020-03-12 9:56', 1),
        ('2020-03-12 9:57', 3),
        ('2020-03-14 9:58', 2),
        ('2020-03-15 9:59', 3),
        ('2020-03-16 10:00', 2),
        ('2020-03-17 10:01', 2),
        ('2020-03-18 10:02', 0),
        ('2020-03-19 10:04', 11)
        s(hour_dim_utc, user_id)
)
,user_days as (
    -- One row per user per calendar day on which that user appears.
    select
        hour_dim_utc::date as day
        ,user_id
    from data
    group by hour_dim_utc::date, user_id
)
,active_days as (
    -- One row per calendar day that has any activity.
    select distinct
        day
    from user_days
)
select
    d.day
    ,count(distinct u.user_id) as acum_count
from active_days as d
inner join user_days as u
    on u.day <= d.day
group by d.day
order by d.day;
给出:
DAY ACUM_COUNT
2020-03-10 2
2020-03-11 4
2020-03-12 4
2020-03-14 4
2020-03-15 4
2020-03-16 4
2020-03-17 4
2020-03-18 4
2020-03-19 5
另外,您的 SQL 中使用了 WHERE DATEDIFF('month', HOUR_DIM.UTC, CURRENT_DATE) BETWEEN 0 AND 0;
改写为 WHERE hour_dim.utc >= DATE_TRUNC('month', CURRENT_DATE)
可读性更好,性能也更高。
对此的一种“聪明”的方法是使用 dense_rank() 的总和;
下面的查询则只在每个用户首次出现的那一天对其计数,再对每天的计数做累计求和:
-- Cumulative distinct users per app for the current month: each user is
-- counted only on the first day they appear, and those per-day counts are
-- summed as a running total.
SELECT first_day, APP_NAME,
       -- After the outer GROUP BY, first_day is unique within each APP_NAME
       -- partition, so the default window frame gives a plain running total.
       SUM(COUNT(*)) OVER (PARTITION BY APP_NAME ORDER BY first_day ASC) as CUMULATIVE_ACTIVE_USER_ACOUNT
FROM (SELECT CLIENT_SID, APP_NAME,
             MIN(DATE_TRUNC('day', HOUR_DIM.UTC)) as first_day
      FROM RPT.S_HOURLY_INACTIVE_TVS_AGG
      -- Half-open range covering the current calendar month. It selects the
      -- same rows as DATEDIFF('month', HOUR_DIM.UTC, CURRENT_DATE) = 0 but
      -- compares the bare column instead of wrapping it in a function.
      -- NOTE(review): HOUR_DIM is not introduced by the FROM clause shown
      -- here; presumably a join was omitted from the snippet -- confirm.
      WHERE HOUR_DIM.UTC >= DATE_TRUNC('month', CURRENT_DATE)
        AND HOUR_DIM.UTC < DATEADD('month', 1, DATE_TRUNC('month', CURRENT_DATE))
      GROUP BY CLIENT_SID, APP_NAME
     ) cs
GROUP BY first_day, APP_NAME;
如果数据足够密集,使得当月的每一天都至少有一个用户是首次出现,那么 Gordon 更新后的答案很好;但当数据像我的示例数据那样稀疏时,就得不到预期的结果(没有新用户首次出现的日期不会出现在输出中)。
Gordon 的代码实际上相当于:
-- First-day approach applied to the sparse sample data: only days on which
-- some user appears for the first time produce an output row.
with data as (
    -- Inline sample events: one row per (timestamp, user) observation.
    select
        hour_dim_utc::timestamp_ntz as hour_dim_utc
        ,user_id
    from values
        ('2020-03-10 9:50', 1),
        ('2020-03-10 9:51', 3),
        ('2020-03-10 10:51', 3),
        ('2020-03-11 9:52', 1),
        ('2020-03-11 9:53', 2),
        ('2020-03-11 9:54', 0),
        ('2020-03-12 9:55', 0),
        ('2020-03-12 9:56', 1),
        ('2020-03-12 9:57', 3),
        ('2020-03-14 9:58', 2),
        ('2020-03-15 9:59', 3),
        ('2020-03-16 10:00', 2),
        ('2020-03-17 10:01', 2),
        ('2020-03-18 10:02', 0),
        ('2020-03-19 10:04', 11)
        s(hour_dim_utc, user_id)
)
,user_first_day as (
    -- Earliest calendar day on which each user appears.
    select
        user_id
        ,min(hour_dim_utc::date) as first_day
    from data
    group by user_id
)
select
    first_day
    -- Number of users first seen on each day, accumulated in day order.
    ,sum(count(*)) over (order by first_day asc) as acum
from user_first_day
group by first_day;
结果为:
FIRST_DAY ACUM
2020-03-10 2
2020-03-11 4
2020-03-19 5
我知道这是旧的,但希望这会帮助任何寻找类似东西的人。
如果您查看 OP 的最后一篇帖子,会发现其中没有 3 月 13 日。正如西蒙所说,他的数据很稀疏。要让每一天都有一行,需要创建一个日期脊柱(date spine)。我在上一篇帖子的 SQL 基础上,引用了一张每天都有一行的日期表(在下面的示例中,该表名为 my_date_table,日期列名为 DATE_KEY)。由于这类日期表的范围往往延伸到很久以前或很久以后,我查询了初始数据集的 min() 和 max() 值,以限制从日期表返回的行。
我在查询中保留了 first_day 字段,但已将其注释掉;您可以取消注释,以查看日期脊柱与数据集返回的日期之间的对应关系。
-- Cumulative distinct users per calendar day, with one output row for every
-- date-table day between the first and last day present in the sample data,
-- including days on which nobody was active.
WITH
dates AS (
    -- Date spine. The accompanying text describes my_date_table as holding
    -- one row per calendar day.
    SELECT DATE_KEY
    FROM my_date_table
)
,data AS (
    -- Inline sample events: one row per (timestamp, user) observation.
    select hour_dim_utc::timestamp_ntz as hour_dim_utc, user_id from values
        ('2020-03-10 9:50', 1 ),
        ('2020-03-10 9:51', 3 ),
        ('2020-03-10 10:51', 3 ),
        ('2020-03-11 9:52', 1 ),
        ('2020-03-11 9:53', 2 ),
        ('2020-03-11 9:54', 0 ),
        ('2020-03-12 9:55', 0 ),
        ('2020-03-12 9:56', 1 ),
        ('2020-03-12 9:57', 3 ),
        ('2020-03-14 9:58', 2 ),
        ('2020-03-15 9:59', 3 ),
        ('2020-03-16 10:00', 2 ),
        ('2020-03-17 10:01', 2 ),
        ('2020-03-18 10:02', 0 ),
        ('2020-03-19 10:04', 11 )
        s( hour_dim_utc, user_id)
)
,RANGES as (
    -- First and last day present in the data; used to trim the date spine.
    SELECT
        min(hour_dim_utc::date) AS min_day
        ,max(hour_dim_utc::date) AS max_day
    FROM data
)
, first_days AS (
    -- Number of users whose first appearance falls on each day. This is a
    -- plain per-day count, not a running total: the accumulation happens
    -- once, in the final SELECT, so that values are not accumulated twice.
    select
        first_day
        ,count(*) as new_users
    from (
        select user_id
            ,min(hour_dim_utc::date) as first_day
        from data
        group by user_id
    )
    group by first_day
)
SELECT
    D.DATE_KEY
--  ,FD.FIRST_DAY
    -- Days without a first-time user have no FIRST_DAYS match; they add 0 and
    -- carry the previous total forward. For the sample data the total is 2 on
    -- 2020-03-10, 4 from 2020-03-11 through 2020-03-18, and 5 on 2020-03-19.
    ,sum(coalesce(FD.NEW_USERS, 0)) over (ORDER BY D.DATE_KEY ASC rows between unbounded preceding and current row) AS ACUM
FROM DATES D
inner join ranges ON d.date_key >= ranges.min_day and d.date_key <= ranges.max_day
LEFT JOIN FIRST_DAYS FD ON D.DATE_KEY = FD.FIRST_DAY
ORDER BY D.DATE_KEY;
原帖在此给出的输出是 2、6、6、…、11,这是对已经累计过的 ACUM 再做一次累计求和造成的重复累加;按“累计去重用户数”计算,示例数据的正确结果应为:
+------------+------+
| DATE_KEY | ACUM |
+------------+------+
| 2020-03-10 | 2 |
| 2020-03-11 | 4 |
| 2020-03-12 | 4 |
| 2020-03-13 | 4 |
| 2020-03-14 | 4 |
| 2020-03-15 | 4 |
| 2020-03-16 | 4 |
| 2020-03-17 | 4 |
| 2020-03-18 | 4 |
| 2020-03-19 | 5 |
+------------+------+
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.