简体   繁体   中英

Count by group using dplyr, but also count 0s

I'm making a count table for 2 data frames. One looks like this:

table1 <- data.table("Col1" = c("Al", "Al", "Al", "Cu", "Cu", "Cu", "Pb", "Pb", 
"Pb"), "Col2" = c("F", "UF", "P", "F", "UF", "P", "F", "UF", "P"), "Col3" = c("C", 
"UC", "<", "C", "UC", "<", "C", "UC", "<"))

table2 <- data.table("Col1" = c("Al", "Al", "Cu", "Pb", "Pb", "Pb"), "Col2" = c("F", 
"UF", "F", "F", "UF", "P"), "Col3" = c("C", "UC", "<", "C", "UC", "<"))

I'd create a count table for table1 and table2 like this:

table1 %>% group_by(Col1, Col2, Col3) %>% tally()

# A tibble: 9 x 4
# Groups:   Col1, Col2 [9]
  Col1  Col2  Col3      n
  <chr> <chr> <chr> <int>
1 Al    F     C         1
2 Al    P     <         1
3 Al    UF    UC        1
4 Cu    F     C         1
5 Cu    P     <         1
6 Cu    UF    UC        1
7 Pb    F     C         1
8 Pb    P     <         1
9 Pb    UF    UC        1

table2 %>% group_by(Col1, Col2, Col3) %>% tally()
# A tibble: 6 x 4
# Groups:   Col1, Col2 [6]
  Col1  Col2  Col3      n
  <chr> <chr> <chr> <int>
1 Al    F     C         1
2 Al    UF    UC        1
3 Cu    F     <         1
4 Pb    F     C         1
5 Pb    P     <         1
6 Pb    UF    UC        1

But I'd like the count table for table2 to have the 0 counts with the combinations from table1, not remove them completely so it doesn't show the 0 counts. Is there a way I can do this with dplyr or a different package?

Thank you for your help!

What about this?

table1 %>%
  left_join(cbind(table2, n = 1)) %>%
  group_by(Col1, Col2, Col3) %>%
  mutate(n = sum(n, na.rm = TRUE))

and we will see

  Col1  Col2  Col3      n
  <chr> <chr> <chr> <dbl>
1 Al    F     C         1
2 Al    UF    UC        1
3 Al    P     <         0
4 Cu    F     C         0
5 Cu    UF    UC        0
6 Cu    P     <         0
7 Pb    F     C         1
8 Pb    UF    UC        1
9 Pb    P     <         1

You can try complete

library(tidyverse)


table2 %>% 
  count(Col1, Col2, Col3, name = "sum") %>% 
  complete(distinct_all(table1), fill = list(sum=0))
# A tibble: 10 x 4
   Col1  Col2  Col3    sum
   <chr> <chr> <chr> <dbl>
 1 Al    F     C         1
 2 Al    P     <         0
 3 Al    UF    UC        1
 4 Cu    F     C         0
 5 Cu    P     <         0
 6 Cu    UF    UC        0
 7 Pb    F     C         1
 8 Pb    P     <         1
 9 Pb    UF    UC        1
10 Cu    F     <         1

Or a full_join

table2 %>% 
  count(Col1, Col2, Col3, name = "sum") %>% 
  full_join(distinct_all(table1)) %>% 
  mutate(sum=replace_na(sum, 0))

1) Append an n=1 column to table2 and an n=0 column to table 1 and then sum n by group.

table2 %>% 
  mutate(n = 1L) %>%
  bind_rows(table1 %>% mutate(n = 0L)) %>%
  group_by(Col1, Col2, Col3) %>%
  summarize(n = sum(n), .groups = "drop")

giving:

# A tibble: 10 x 4
   Col1  Col2  Col3      n
   <chr> <chr> <chr> <int>
 1 Al    F     C         1
 2 Al    P     <         0
 3 Al    UF    UC        1
 4 Cu    F     <         1
 5 Cu    F     C         0
 6 Cu    P     <         0
 7 Cu    UF    UC        0
 8 Pb    F     C         1
 9 Pb    P     <         1
10 Pb    UF    UC        1

2) This variation gives the same result.

list(table1, table2) %>%
  bind_rows(.id = "id") %>% 
  group_by(Col1, Col2, Col3) %>%
  summarize(n = sum(id == 2L), .groups = "drop")

3) This is a data.table only solution.

rbindlist(list(table1, table2), idcol = TRUE)[, 
  .(n = sum(.id == 2L)), by = .(Col1, Col2, Col3)]

4) This is a base R solution.

both <- rbind(transform(table1, n = 0), transform(table2, n = 1))
aggregate(n ~., both, sum)

5) This uses SQL.

library(sqldf)

sqldf("with both as (
    select *, 0 as n from table1
    union all
    select *, 1 as n from table2
  )
  select Col1, Col2, Col3, sum(n) as n 
    from both 
    group by Col1, Col2, Col3
")

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM