簡體   English   中英

使用 {dbplyr} 按組執行 mutate() 時如何使用 `last()`?

[英]How to use `last()` when mutating by group with {dbplyr}?

考慮以下遠程表:

library(dbplyr)
# Spell the argument out and use FALSE: `w = F` relies on partial argument
# matching and on `F`, which is an ordinary variable that can be reassigned.
library(dplyr, warn.conflicts = FALSE)

# Small in-memory database table used as the "remote" data in the examples.
remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id = c(1, 3, 5, 7, 2, 4, 6)
)

我希望按 grp 分組、按 win 排序,並取每組的最后一個 id。如果先用 collect() 把數據收集到本地,這相當簡單:

# Reference result: pull the rows into R first, then compute locally.
# Sorting inside the groups (grp first, then win) gives the same row order
# as sorting by grp and win before grouping.
remote_data %>%
  collect() %>%
  group_by(grp) %>%
  arrange(win, .by_group = TRUE) %>%
  mutate(last_id = last(id)) %>%
  ungroup()
#> # A tibble: 7 × 4
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2

但是,我無法通過刪除collect()將其直接轉換為 {dbplyr} 代碼,盡管 SQL 代碼看起來還不錯,但這里發生了什么?

# Same pipeline as the local version but without collect(), so the verbs are
# translated to SQL instead of running in R.
# print() displays the rows; show_query() displays the generated SQL.
# In the SQL below, the arrange() columns end up in the window's ORDER BY.
remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT
#>   *,
#>   LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

dbplyr::window_order()允許我們覆蓋由 group_by() 創建的 ORDER BY 子句,我試過window_order(,win) ,但沒有成功:

# Attempt 2: set the window ordering explicitly with window_order(win).
# The generated SQL below then orders the window by `win` only, but last_id
# still equals the current row's id.
remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(win) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

出於某種原因window_order(,grp)確實會觸發 window 計算,但不會觸發預期的順序:

# Attempt 3: order the window by the grouping column itself.
# The generated SQL below orders the window by `grp`; last_id is now constant
# within each group, but it is not the id of the row with the largest `win`.
remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       5
#> 6     2 C         3       5
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

我該怎么做,才能僅使用遠程計算(最好是 {dbplyr} 代碼)得到我最初期望的輸出?

雖然last()似乎被破壞,但first()似乎按預期工作,因此您可以使用order_by參數和row_number()來獲取最后一個值。 您還需要將arrange()進一步向下移動到管道中,並在其位置使用window_order()

library(dbplyr)
# Spell the argument out and use FALSE: `w = F` relies on partial argument
# matching and on `F`, which is an ordinary variable that can be reassigned.
library(dplyr, warn.conflicts = FALSE)

# Workaround: number the rows inside each group following the window order,
# then take the first id when those row numbers are read in descending order,
# which is the last id of the group.
remote_data %>%
  # window_order() sets the ordering used by row_number() below.
  window_order(grp, win) %>%
  group_by(grp) %>%
  mutate(
    rn = row_number(),
    last_id = first(id, order_by = desc(rn))
  ) %>%
  # Sort the final result only after the window columns have been computed.
  arrange(grp, win, rn) %>%
  ungroup() %>%
  # rn is a helper column and is not part of the wanted output.
  select(-rn) %>%
  print() %>%
  show_query()

# Source:     SQL [7 x 4]
# Database:   sqlite 3.39.4 [:memory:]
# Ordered by: grp, win, rn
    grp win      id last_id
  <dbl> <chr> <dbl>   <dbl>
1     1 A         4       6
2     1 B         7       6
3     1 C         6       6
4     2 A         5       3
5     2 B         1       3
6     2 C         3       3
7     3 C         2       2
<SQL>
SELECT
  `grp`,
  `win`,
  `id`,
  FIRST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `rn` DESC) AS `last_id`
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `rn`
  FROM `dbplyr_008`
)
ORDER BY `grp`, `win`, `rn`

看來您需要使用 window_frame():窗口帶有 ORDER BY 時,默認的窗口范圍只到當前行為止,因此 LAST_VALUE() 返回的就是當前行的值;window_frame() 會把范圍擴展到整個分組。

library(dbplyr)
# Spell the argument out and use FALSE: `w = F` relies on partial argument
# matching and on `F`, which is an ordinary variable that can be reassigned.
library(dplyr, warn.conflicts = FALSE)

# Small in-memory database table used as the "remote" data in the example.
remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id = c(1, 3, 5, 7, 2, 4, 6)
)

remote_data %>%
  group_by(grp) %>%
  window_order(win) %>%
  # Widen the frame to the whole partition (UNBOUNDED PRECEDING to UNBOUNDED
  # FOLLOWING in the generated SQL) so LAST_VALUE() sees every row of the
  # group rather than only the rows up to the current one.
  window_frame(from = -Inf, to = Inf) %>%
  mutate(last_id = last(id)) %>%
  ungroup() %>%
  print() %>%
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT
#>   *,
#>   LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_id`
#> FROM `dbplyr_001`

創建於 2022-12-05,使用reprex v2.0.2

這對我來說就像是 dbplyr 中的一個錯誤。 你能在 dbplyr 倉庫(repo)中提交一個 issue 嗎? 然后我可以為下一個 dbplyr 版本修復這個問題。

這是使用連接的解決方法,但這不是很令人滿意,也可能效率低下:

# Lookup table: for each grp, keep the row(s) whose `win` equals the group's
# maximum `win`, and expose that row's id as last_id.
# NOTE(review): if several rows of a group share the maximum `win` with
# different ids, the lookup holds more than one row for that group and the
# join below would duplicate rows -- confirm ties cannot occur.
lkp <- remote_data %>% 
  group_by(grp) %>% 
  filter(win == max(win, na.rm = TRUE)) %>% 
  ungroup() %>% 
  select(grp,  last_id = id) %>% 
  distinct()

# Attach last_id to every row of its group, then sort for display.
remote_data %>% 
  left_join(lkp, by = "grp") %>% 
  arrange(grp, win) %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT *
#> FROM (
#>   SELECT `LHS`.`grp` AS `grp`, `win`, `id`, `last_id`
#>   FROM `dbplyr_001` AS `LHS`
#>   LEFT JOIN (
#>     SELECT DISTINCT `grp`, `id` AS `last_id`
#>     FROM (
#>       SELECT `grp`, `win`, `id`
#>       FROM (
#>         SELECT *, MAX(`win`) OVER (PARTITION BY `grp`) AS `q01`
#>         FROM `dbplyr_001`
#>       )
#>       WHERE (`win` = `q01`)
#>     )
#>   ) AS `RHS`
#>     ON (`LHS`.`grp` = `RHS`.`grp`)
#> )
#> ORDER BY `grp`, `win`

創建於 2022-12-04,使用reprex v2.0.2

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM