[英]Big query query is too complex after pivot
假設我有以下 table
和興趣列表(cat, dog, music, soccer, coding)
| userId | user_interest | label |
| -------- | -------------- |----------|
| 12345 | cat | 1 |
| 12345 | dog | 1 |
| 6789 | music | 1 |
| 6789 | soccer | 1 |
我想將用戶興趣轉換為二進制數組(即二值化),結果表將類似於
| userId | labels |
| -------- | -------------- |
| 12345 | [1,1,0,0,0] |
| 6789 | [0,0,1,1,0] |
我可以用PIVOT
和ARRAY
來做到這一點,例如
-- Pivot each user's interests into one column per interest, then pack those
-- columns into a fixed-order 0/1 array.
-- `table` is the question's placeholder for the real source table name.
WITH user_interest_pivot AS (
  SELECT
    *
  FROM (
    SELECT userId, user_interest, label FROM table
  ) AS T
  PIVOT
  (
    -- One output column per listed interest; the pivot column must be the
    -- real column name `user_interest`.
    MAX(label) FOR user_interest IN ('cat', 'dog', 'music', 'soccer', 'coding')
  ) AS P
)
SELECT
  userId,
  -- A user without a given interest has NULL in that pivoted column; map it to 0.
  ARRAY[IFNULL(cat,0), IFNULL(dog,0), IFNULL(music,0), IFNULL(soccer,0), IFNULL(coding,0)] AS labels,
FROM user_interest_pivot
但是,實際上我有很長的興趣列表,而 bigquery 中的上述方法似乎不起作用,因為
查詢執行期間資源超出:沒有足夠的資源用於查詢計划 - 子查詢太多或查詢太復雜
如果有什麼辦法可以處理這種情況,請告訴我。謝謝!
根據您的真實數據,它仍然可能遇到資源問題,但值得嘗試以下不使用 PIVOT 的方法。先為每個興趣建立索引(idx),得到如下的 interests 查找表:
+----------+-----+-----------------+
| interest | idx | total_interests |
+----------+-----+-----------------+
| cat | 0 | 5 |
| dog | 1 | 5 |
| music | 2 | 5 |
| soccer | 3 | 5 |
| coding | 4 | 5 |
+----------+-----+-----------------+
-- Collect, per user, the index (idx) of every interest that user has.
SELECT
  t.userId,
  ARRAY_AGG(i.idx) AS user_interests
FROM sample_table AS t
INNER JOIN interests AS i
  ON i.interest = t.user_interest
GROUP BY t.userId
-- Expression fragment: turns a user's interest indexes (t.user_interests)
-- into an array with one element per position 0 .. total_interests - 1.
-- An element is 1 when the position matched one of the user's indexes, else 0.
ARRAY(SELECT IF(ui IS NULL, 0, 1)
FROM UNNEST(GENERATE_ARRAY(0, total_interests - 1)) i
-- Positions with no matching index keep ui NULL because of the LEFT JOIN.
LEFT JOIN t.user_interests ui ON i = ui
-- Emit the elements in position order.
ORDER BY i
) AS labels
-- Sample data: one row per (user, interest) pair; label is always 1.
CREATE TEMP TABLE sample_table AS
SELECT
  userId,
  user_interest,
  1 AS label
FROM UNNEST([
  STRUCT('12345' AS userId, 'cat' AS user_interest),
  ('12345', 'dog'),
  ('6789', 'music'),
  ('6789', 'soccer')
]);
-- Lookup table: every interest with its 0-based position in the list (idx)
-- and the total number of interests repeated on each row.
CREATE TEMP TABLE interests AS
SELECT
  interest,
  idx,
  COUNT(*) OVER () AS total_interests
FROM UNNEST(['cat', 'dog', 'music', 'soccer', 'coding']) AS interest
  WITH OFFSET AS idx
;
-- Build one fixed-length 0/1 label array per user.
-- Inner query: collect the distinct interest indexes of each user.
-- Outer query: walk positions 0 .. total_interests - 1 and emit 1 where the
-- position is among the user's indexes, 0 otherwise.
SELECT
  userId,
  ARRAY(
    SELECT IF(ui IS NULL, 0, 1)
    FROM UNNEST(GENERATE_ARRAY(0, total_interests - 1)) i
    -- Unmatched positions keep ui NULL because of the LEFT JOIN.
    LEFT JOIN u.user_interests ui ON i = ui
    ORDER BY i
  ) AS labels
FROM (
  SELECT
    t.userId,
    i.total_interests,
    -- DISTINCT: a repeated (userId, user_interest) row would otherwise match
    -- the same position twice and make labels longer than total_interests.
    ARRAY_AGG(DISTINCT i.idx) AS user_interests
  FROM sample_table AS t
  INNER JOIN interests AS i
    ON t.user_interest = i.interest
  GROUP BY t.userId, i.total_interests
) AS u;
我認為以下方法可以應付任何[合理的]數據
-- Builds, per user, an array of '0'/'1' strings with one element per interest,
-- in the order of the interest list.
--
-- The earlier form summed pow(2, offset) weights as FLOAT64 and converted the
-- sum to binary in a JS UDF. FLOAT64 keeps only 53 significant bits, so with
-- longer interest lists the low-order positions were silently dropped, and a
-- duplicated (userid, user_interest) row double-counted its weight. This form
-- works on integer positions only, so it is exact for any list length and no
-- longer needs the base10to2 UDF.
with your_table as (
  select '12345' as userid, 'cat' as user_interest, 1 as label union all
  select '12345' as userid, 'dog' as user_interest, 1 as label union all
  select '6789' as userid, 'music' as user_interest, 1 as label union all
  select '6789' as userid, 'soccer' as user_interest, 1 as label
), interests as (
  -- pos: 0-based position of the interest; len: number of interests.
  select user_interest, pos, count(*) over () as len
  from unnest(['cat', 'dog', 'music', 'soccer', 'coding']) as user_interest
  with offset as pos
), user_positions as (
  -- One row per user with the distinct positions of that user's interests.
  select userid, any_value(len) as len, array_agg(distinct pos) as positions
  from your_table
  inner join interests
  using (user_interest)
  group by userid
)
select
  userid,
  -- '1' where the position is one of the user's interests, '0' elsewhere.
  array(
    select if(p in unnest(positions), '1', '0')
    from unnest(generate_array(0, len - 1)) as p
    order by p
  ) as labels,
from user_positions
與 output
-- Reproduction of the bit-weight approach with 56 interests (0 .. 55).
-- base10to2 renders a FLOAT64 value as a binary string via JS toString(2).
-- NOTE(review): FLOAT64 / JS numbers carry 53 significant bits. The user below
-- has interests at offsets 1 and 54, so the summed weight 2^1 + ... + 2^54
-- spans more than 53 bits and cannot be stored exactly. This is the likely
-- cause of the missing low-order bit in the result quoted after this query.
create temp function base10to2(x float64) returns string
language js as r'return x.toString(2);';
with your_table as (
select '12345' as userid, 1 as user_interest, 1 as label union all
select '12345' as userid, 3 as user_interest, 1 as label union all
select '12345' as userid, 30 as user_interest, 1 as label union all
select '12345' as userid, 54 as user_interest, 1 as label
), interests as (
-- weight = 2^offset, computed as FLOAT64; len = number of interests (56 here).
select *, pow(2, offset) weight, max(offset + 1) over() as len
from unnest(GENERATE_ARRAY(0, 55)) user_interest
with offset
)
select userid,
-- Reverse so position 0 comes first, then right-pad with '0' up to len.
rpad(reverse(base10to2(sum(weight))), any_value(len), '0') labels,
from your_table
join interests
using(user_interest)
group by userid
;
查詢結果:
000 1 000000000000000000000000000 1 00000000000000000000000 1 0
但我認為應該是:
0 1 0 1 00000000000000000000000000 1 00000000000000000000000 1 0
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.