簡體   English   中英

如何計算 R dplyr 中不同觀察值的總和

[英]How to calculate the sum of distinct observations in R dplyr

我有個疑問。雖然我知道如何使用 n_distinct() 計算每組中不同值的個數,但要計算每組中不同(唯一)觀察值的總和,對我來說似乎有些困難。

我想按 id 分組,然后對每個唯一值求和......

library(tidyverse)

# Example data: two string columns, one count column for each of them, and
# the grouping key `id`.
df <- tibble(
  col1 = c("apple", "apple", "pple", "banana", "banana", "bananna"),
  col2 = c("pple", "app", "app", "bananna", "banan", "banan"),
  counts_col1 = c(100, 100, 2, 200, 200, 2),
  counts_col2 = c(2, 50, 50, 2, 20, 20),
  id = c(1, 1, 1, 2, 2, 2)
)

# Stack the two count columns into long form (column name goes to `strings`,
# the number goes to `value`), then group the result by `id`.
df1 <- df %>%
  pivot_longer(
    cols = counts_col1:counts_col2,
    names_to = "strings",
    values_to = "value"
  ) %>%
  group_by(id)

df1
#> # A tibble: 12 × 5
#> # Groups:   id [2]
#>    col1    col2       id strings     value
#>    <chr>   <chr>   <dbl> <chr>       <dbl>
#>  1 apple   pple        1 counts_col1   100
#>  2 apple   pple        1 counts_col2     2
#>  3 apple   app         1 counts_col1   100
#>  4 apple   app         1 counts_col2    50
#>  5 pple    app         1 counts_col1     2
#>  6 pple    app         1 counts_col2    50
#>  7 banana  bananna     2 counts_col1   200
#>  8 banana  bananna     2 counts_col2     2
#>  9 banana  banan       2 counts_col1   200
#> 10 banana  banan       2 counts_col2    20
#> 11 bananna banan       2 counts_col1     2
#> 12 bananna banan       2 counts_col2    20

reprex package (v2.0.1) 創建於 2022-03-16

...最終會變成這樣


#>    col1    col2       id strings     value    sum_distinct
#>    <chr>   <chr>   <dbl> <chr>       <dbl>      
#>  1 apple   pple        1 counts_col1   100    152
#>  2 apple   pple        1 counts_col2     2    NA
#>  3 apple   app         1 counts_col1   100    NA
#>  4 apple   app         1 counts_col2    50    NA
#>  5 pple    app         1 counts_col1     2    NA
#>  6 pple    app         1 counts_col2    50    NA
#>  7 banana  bananna     2 counts_col1   200    222
#>  8 banana  bananna     2 counts_col2     2    NA
#>  9 banana  banan       2 counts_col1   200    NA
#> 10 banana  banan       2 counts_col2    20    NA
#> 11 bananna banan       2 counts_col1     2    NA
#> 12 bananna banan       2 counts_col2    20    NA

我們可以將 replace() 與 unique() 搭配使用:

library(dplyr)
library(tidyr)
# Reshape the two count columns to long form, then, per `id`, put the sum of
# the distinct `value`s on the first row of the group and NA on every other
# row.
df %>%
  pivot_longer(
    cols = counts_col1:counts_col2,
    names_to = "strings",
    values_to = "value"
  ) %>%
  # Only `id` defines the groups; a preceding group_by() on other columns
  # would simply be replaced by this one.
  group_by(id) %>%
  mutate(
    # Start from an all-NA double vector of group length and overwrite its
    # first element with the sum of the group's unique values.
    sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))
  ) %>%
  ungroup()

-輸出

# A tibble: 12 × 6
   col1    col2       id strings     value sum_distinct
   <chr>   <chr>   <dbl> <chr>       <dbl>        <dbl>
 1 apple   pple        1 counts_col1   100          152
 2 apple   pple        1 counts_col2     2           NA
 3 apple   app         1 counts_col1   100           NA
 4 apple   app         1 counts_col2    50           NA
 5 pple    app         1 counts_col1     2           NA
 6 pple    app         1 counts_col2    50           NA
 7 banana  bananna     2 counts_col1   200          222
 8 banana  bananna     2 counts_col2     2           NA
 9 banana  banan       2 counts_col1   200           NA
10 banana  banan       2 counts_col2    20           NA
11 bananna banan       2 counts_col1     2           NA
12 bananna banan       2 counts_col2    20           NA

使用data.table ,你可以這樣做:

可重現範例(Reprex)

  • 代碼
library(tidyverse) # to read your tibble
library(data.table)

# 1 - Building your starting data.table
# Convert `df` to a data.table by reference, reshape the two count columns
# into long form (column name in `string`, number in `value`), then sort by
# id and col1 ascending and col2 descending.
setDT(df)
df <- melt(
  df,
  id.vars = c("col1", "col2", "id"),
  measure.vars = c("counts_col1", "counts_col2"),
  variable.name = "string"
)
df <- df[order(id, col1, -col2)]

df
#>        col1    col2 id      string value
#>  1:   apple    pple  1 counts_col1   100
#>  2:   apple    pple  1 counts_col2     2
#>  3:   apple     app  1 counts_col1   100
#>  4:   apple     app  1 counts_col2    50
#>  5:    pple     app  1 counts_col1     2
#>  6:    pple     app  1 counts_col2    50
#>  7:  banana bananna  2 counts_col1   200
#>  8:  banana bananna  2 counts_col2     2
#>  9:  banana   banan  2 counts_col1   200
#> 10:  banana   banan  2 counts_col2    20
#> 11: bananna   banan  2 counts_col1     2
#> 12: bananna   banan  2 counts_col2    20

# 2 - Computing the 'sum_distinct' column
# For each id, sum the distinct values and keep that sum on the group's first
# row only; every other row of the group gets NA. NA_real_ matches the double
# `value` column, so the new column needs no type coercion. The trailing []
# prints the result, because := returns invisibly.
df[, sum_distinct := replace(rep(NA_real_, .N), 1L, sum(unique(value))),
   by = id][]
  • Output
#>        col1    col2    id      string value sum_distinct
#>      <char>  <char> <num>      <fctr> <num>        <num>
#>  1:   apple    pple     1 counts_col1   100          152
#>  2:   apple    pple     1 counts_col2     2           NA
#>  3:   apple     app     1 counts_col1   100           NA
#>  4:   apple     app     1 counts_col2    50           NA
#>  5:    pple     app     1 counts_col1     2           NA
#>  6:    pple     app     1 counts_col2    50           NA
#>  7:  banana bananna     2 counts_col1   200          222
#>  8:  banana bananna     2 counts_col2     2           NA
#>  9:  banana   banan     2 counts_col1   200           NA
#> 10:  banana   banan     2 counts_col2    20           NA
#> 11: bananna   banan     2 counts_col1     2           NA
#> 12: bananna   banan     2 counts_col2    20           NA

reprex package (v2.0.1) 創建於 2022-03-16

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM