簡體   English   中英

dplyr::count() 多列

[英]dplyr::count() multiple columns

我有以下數據集:

# Example data: ten rows, six character columns; NA marks a missing cell.
# The object carries the tibble classes, so it prints as a tibble once the
# tidyverse is attached.
dat <- structure(
  list(
    C86_1981 = c(
      "Outer London", "Buckinghamshire", NA, "Ross and Cromarty",
      "Cornwall and Isles of Scilly", NA, "Kirkcaldy", "Devon", "Kent",
      "Renfrew"
    ),
    C96_1981 = c(
      "Outer London", "Buckinghamshire", NA, "Ross and Cromarty",
      "Not known/missing", NA, "Kirkcaldy", NA, NA, NA
    ),
    C00_1981 = c(
      "Outer London", "Inner London", "Lancashire", "Ross and Cromarty",
      NA, "Humberside", "Kirkcaldy", NA, NA, NA
    ),
    C04_1981 = c(
      "Kent", NA, NA, "Ross and Cromarty", NA, "Humberside",
      "Not known/missing", NA, NA, "Renfrew"
    ),
    C08_1981 = c(
      "Kent", "Oxfordshire", NA, "Ross and Cromarty",
      "Cornwall and Isles of Scilly", "Humberside", "Dunfermline", NA, NA,
      "Renfrew"
    ),
    C12_1981 = c(
      "Kent", NA, NA, "Ross and Cromarty", "Cornwall and Isles of Scilly",
      "Humberside", "Dunfermline", NA, NA, "Renfrew"
    )
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)

我想對每一列分別執行 dplyr::count()。預期輸出:

# A tibble: 10 x 3
                       C86_1981 dat86_n dat96_n ...
                          <chr>   <int>   <int>
 1              Buckinghamshire       1       1
 2 Cornwall and Isles of Scilly       1      NA
 3                        Devon       1      NA
 4                         Kent       1      NA
 5                    Kirkcaldy       1       1
 6                 Outer London       1       1
 7                      Renfrew       1      NA
 8            Ross and Cromarty       1       1
 9                         <NA>       2       5
10            Not known/missing      NA       1

目前我正在手動執行此操作,然后dplyr::full_join()結果:

library("tidyverse")

# Current manual approach: tally one column at a time, renaming each count
# column so the tallies stay distinguishable after joining.
dat86_n <- rename(count(dat, C86_1981), dat86_n = n)
dat96_n <- rename(count(dat, C96_1981), dat96_n = n)
# ...

# Combine the per-column tallies, matching rows on the place name.
dat_counts <- full_join(dat86_n, dat96_n, by = c("C86_1981" = "C96_1981"))
# ...

這樣可行,但如果我的數據稍後發生更改,這種做法並不夠穩健。我希望能以編程方式執行此操作。

我試過一個循環:

# Attempt 1: hand every column of `dat` straight to count().
# lapply() passes each column as a bare character vector rather than a
# data frame, which count() cannot work on -- see the error below.
lapply(dat, count)
# Error in UseMethod("groups") : 
# no applicable method for 'groups' applied to an object of class "character"

(purrr::map()給出了同樣的錯誤)。 我認為這個錯誤是因為count()需要一個tbl和一個變量作為單獨的參數,所以我也嘗試過:

# Attempt 2: pass `dat` explicitly and use the lapply() element as the variable.
# Inside the function `x` holds one column's values, but count() looks for a
# column of `dat` literally named `x`, which does not exist (error below).
lapply(dat, function(x) {
  count(dat, x)
})
# Error in grouped_df_impl(data, unname(vars), drop) : 
# Column `x` is unknown

同樣, purrr::map()給出了同樣的錯誤。 我也嘗試過summarise_all()變體:

# Attempt 3: summarise_all() applies count() to each column vector in turn,
# so it fails with the same 'groups' dispatch error as the lapply() attempt.
dat %>% 
  summarise_all(count)
  # Error in summarise_impl(.data, dots) : 
  # Evaluation error: no applicable method for 'groups' applied to an object of class "character".

我覺得我錯過了一些明顯的東西,解決方案應該很簡單。 特別歡迎 dplyr 解決方案,因為這是我最熟悉的工具。

如果同時使用 tidyr 包,下面的代碼可以解決問題:

# Stack every column into (name, city) pairs, tally each pair, then spread the
# original column names back out as count columns. A city that never occurs
# in a given column ends up as NA there.
dat %>%
  tidyr::gather(key = "name", value = "city") %>%
  dplyr::count(name, city) %>%
  tidyr::spread(key = name, value = n)

結果:

# A tibble: 15 x 7
                           city C00_1981 C04_1981 C08_1981 C12_1981 C86_1981 C96_1981
 *                        <chr>    <int>    <int>    <int>    <int>    <int>    <int>
 1              Buckinghamshire       NA       NA       NA       NA        1        1
 2 Cornwall and Isles of Scilly       NA       NA        1        1        1       NA
 3                        Devon       NA       NA       NA       NA        1       NA
 4                  Dunfermline       NA       NA        1        1       NA       NA
 5                   Humberside        1        1        1        1       NA       NA
 6                 Inner London        1       NA       NA       NA       NA       NA
 7                         Kent       NA        1        1        1        1       NA
 8                    Kirkcaldy        1       NA       NA       NA        1        1
 9                   Lancashire        1       NA       NA       NA       NA       NA
10            Not known/missing       NA        1       NA       NA       NA        1
11                 Outer London        1       NA       NA       NA        1        1
12                  Oxfordshire       NA       NA        1       NA       NA       NA
13                      Renfrew       NA        1        1        1        1       NA
14            Ross and Cromarty        1        1        1        1        1        1
15                         <NA>        4        5        3        4        2        5

@You-leee 打敗了我 ;)

使用 tidyverse;

library(tidyverse)

# Long form -> number of rows per (year, county) pair -> one count column
# per year, with counties as rows.
df <- dat %>%
  gather(key = "year", value = "county") %>%
  group_by(year, county) %>%
  summarise(no = n()) %>%
  spread(key = year, value = no)

# A tibble: 15 x 7
                         county C00_1981 C04_1981 C08_1981 C12_1981 C86_1981 C96_1981
 *                        <chr>    <int>    <int>    <int>    <int>    <int>    <int>
 1              Buckinghamshire       NA       NA       NA       NA        1        1
 2 Cornwall and Isles of Scilly       NA       NA        1        1        1       NA
 3                        Devon       NA       NA       NA       NA        1       NA
 4                  Dunfermline       NA       NA        1        1       NA       NA
 5                   Humberside        1        1        1        1       NA       NA  
 6                 Inner London        1       NA       NA       NA       NA       NA
 7                         Kent       NA        1        1        1        1       NA
 8                    Kirkcaldy        1       NA       NA       NA        1        1
 9                   Lancashire        1       NA       NA       NA       NA       NA
10            Not known/missing       NA        1       NA       NA       NA        1
11                 Outer London        1       NA       NA       NA        1        1
12                  Oxfordshire       NA       NA        1       NA       NA       NA
13                      Renfrew       NA        1        1        1        1       NA
14            Ross and Cromarty        1        1        1        1        1        1
15                         <NA>        4        5        3        4        2        5

之前使用 gather + count + spread 的答案效果很好,但不適用於非常大的數據集(分組很多或變量很多)。 這是一種替代方法,使用 map-count + join,在非常大的數據上,它似乎快了 2 倍:

library(tidyverse)
# Benchmark fixture: five columns, each holding N lowercase letters drawn
# at random with replacement.
N <- 1e6
df <- tibble(
  x1 = sample(letters, size = N, replace = TRUE),
  x2 = sample(letters, size = N, replace = TRUE),
  x3 = sample(letters, size = N, replace = TRUE),
  x4 = sample(letters, size = N, replace = TRUE),
  x5 = sample(letters, size = N, replace = TRUE)
)


# map-count + join: tally each column separately, then full-join the
# per-column tallies on the shared `value` key.
res1 <- c("x1", "x2", "x3", "x4", "x5") %>%
  map(function(x) {
    col <- rlang::sym(x)
    df %>%
      select_at(x) %>%
      count(!!col) %>%
      # The tallied column becomes the join key `value`; the count column
      # takes over the original column's name.
      rename(value = !!col, !!col := n)
  }) %>%
  reduce(full_join, by = "value")

# Reference result: the same table built via gather + count + spread.
res2 <- df %>%
  tidyr::gather(key = "variable", value = "value") %>%
  dplyr::group_by(variable, value) %>%
  dplyr::count() %>%
  dplyr::ungroup() %>%
  tidyr::spread(key = variable, value = n)

# Confirm that both approaches produce the same table.
all.equal(res1, res2)
#> [1] TRUE

library(microbenchmark)
# Time both approaches on the same `df`, 50 evaluations each:
#   s1 = map-count + full_join, s2 = gather + count + spread.
# check = "equal" asks microbenchmark to verify that the expressions return
# equal results before timing them.
microbenchmark(s1=map(c("x1", "x2", "x3", "x4", "x5"), function(x) select_at(df, x) %>%  count(!!rlang::sym(x)) %>% 
                     rename(value=!!rlang::sym(x),
                            !!rlang::sym(x):=n)) %>% 
                 reduce(full_join, by = "value"),
               s2= df %>% 
                 tidyr::gather(variable, value) %>% 
                 dplyr::group_by(variable, value) %>%
                 dplyr::count() %>% dplyr::ungroup()%>%
                 tidyr::spread(variable, n),
               times = 50, check = "equal")
#> Unit: milliseconds
#>  expr      min       lq     mean   median       uq      max neval
#>    s1 214.9027 220.2292 241.8811 229.0913 242.2507 368.5147    50
#>    s2 412.8934 447.5347 515.2612 528.0221 561.7649 692.5999    50

reprex 包(v0.3.0) 於 2020 年 5 月 19 日創建

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM