简体   繁体   中英

Count unique values by group (data.table) doesn't work

I'm trying to count the unique values from a variable ( ruc_pk_informante ) by two groups ( year and pn_soc_informante ) with data.table but I get different values from dplyr (I used tidytable ).

data.table

library(data.table)
library(tidytable)

> tt[, .(count = uniqueN(ruc_pk_informante)), by = .(sort(year),pn_soc_informante)]
# A tidytable: 4 x 3
   sort pn_soc_informante count
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      10 --------->????
3  2014 PERSONA NATURAL       8
4  2015 PERSONA NATURAL       1

dplyr and manually

> tt %>% arrange.(year) %>% summarise.(n_distinct.(ruc_pk_informante), .by = c(year, pn_soc_informante))
# A tidytable: 4 x 3
   year pn_soc_informante    V1
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      17 --------> OK!
3  2014 PERSONA NATURAL      12
4  2015 PERSONA NATURAL      13
> t1 = tt %>% filter.(year == 2013)
> length(unique(t1$ruc_pk_informante)) # OK!
[1] 17 

Data

# data ---------------------------------------------------------------------


tt = structure(list(ruc_pk_informante = c("R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", "R3", 
                                     "R3", "R3", "R3", "R3", "R3", "R5", "R5", "R7", "R7", "R7", "R7", 
                                     "C9", "C9", "R12", "R12", "R12", "R14", "R16", "R16", "R16", 
                                     "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", 
                                     "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", "R16", 
                                     "C21", "R23", "R23", "R23", "R23", "R23", "R23", "R23", "R23", 
                                     "R23", "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27", 
                                     "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27", "R27", 
                                     "R27", "R27", "R27", "R27", "R27", "R27", "R30", "R30", "R30", 
                                     "R30", "R33", "R33", "R33", "R33", "R33", "R33", "R34", "R34", 
                                     "R34", "R34", "R37", "R41", "R41", "R41", "R41", "R41", "R41", 
                                     "R41", "R44", "R44", "R44", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", 
                                     "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45", "R45"
), year = c(2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 
            2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
            2015L, 2015L, 2015L, 2013L, 2015L, 2012L, 2013L, 2014L, 2015L, 
            2013L, 2015L, 2013L, 2014L, 2015L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 
            2013L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2015L, 2015L, 
            2015L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 2015L, 
            2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2013L, 2014L, 2015L, 
            2015L, 2013L, 2013L, 2014L, 2014L, 2015L, 2015L, 2013L, 2013L, 
            2014L, 2015L, 2013L, 2013L, 2013L, 2014L, 2014L, 2015L, 2015L, 
            2015L, 2013L, 2014L, 2015L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
            2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 
            2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L
), pn_soc_informante = c("PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", 
                         "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL", "PERSONA NATURAL"
)), row.names = c(NA, -250L), class = c("tidytable", "data.table", 
                                        "data.frame"), index = structure(integer(0), "`__year`" = c(67L, 
                                                                                                                                                       1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
                                                                                                                                                       15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 65L, 68L, 71L, 73L, 76L, 
                                                                                                                                                       77L, 78L, 79L, 80L, 81L, 82L, 98L, 99L, 100L, 101L, 108L, 109L, 
                                                                                                                                                       110L, 111L, 112L, 113L, 114L, 115L, 116L, 131L, 135L, 136L, 141L, 
                                                                                                                                                       142L, 145L, 146L, 147L, 153L, 156L, 157L, 158L, 159L, 160L, 161L, 
                                                                                                                                                       162L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L, 
                                                                                                                                                       173L, 174L, 175L, 176L, 177L, 178L, 179L, 180L, 181L, 182L, 183L, 
                                                                                                                                                       184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 193L, 194L, 
                                                                                                                                                       195L, 196L, 197L, 198L, 199L, 200L, 201L, 202L, 203L, 204L, 205L, 
                                                                                                                                                       206L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 23L, 24L, 25L, 
                                                                                                                                                       26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 
                                                                                                                                                       39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 69L, 
                                                                                                                                                       74L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 102L, 103L, 104L, 
                                                                                                                                                       117L, 118L, 119L, 120L, 121L, 132L, 137L, 138L, 143L, 148L, 149L, 
                                                                                                                                                       154L, 214L, 215L, 216L, 217L, 218L, 219L, 220L, 221L, 222L, 223L, 
                                                                                                                                                       224L, 225L, 226L, 227L, 228L, 229L, 230L, 231L, 232L, 233L, 234L, 
                                                                                                                                                       235L, 236L, 237L, 238L, 239L, 240L, 241L, 242L, 243L, 244L, 245L, 
                                                                                                                                                       246L, 247L, 248L, 249L, 250L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 
                                                                                                                                                       58L, 59L, 60L, 61L, 62L, 63L, 64L, 66L, 70L, 72L, 75L, 91L, 92L, 
                                                                                                                                                       93L, 94L, 95L, 96L, 97L, 105L, 106L, 107L, 122L, 123L, 124L, 
                                                                                                                                                       125L, 126L, 127L, 128L, 129L, 130L, 133L, 134L, 139L, 140L, 144L, 
                                                                                                                                                       150L, 151L, 152L, 155L)))


The sort on the by is breaking the order for the rows - ie it is rearranging the year separately and this would result in erroneous values from other columns matching with the sorted column of 'year'. It should be either on the i

tt[order(year), .(count = uniqueN(ruc_pk_informante)), 
     by = .(year, pn_soc_informante)]
# A tidytable: 4 × 3
   year pn_soc_informante count
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      17
3  2014 PERSONA NATURAL      12
4  2015 PERSONA NATURAL      13

or do the order later

tt[, .(count = uniqueN(ruc_pk_informante)), 
   by = .(year, pn_soc_informante)][order(year)]
# A tidytable: 4 × 3
   year pn_soc_informante count
  <int> <chr>             <int>
1  2012 PERSONA NATURAL       1
2  2013 PERSONA NATURAL      17
3  2014 PERSONA NATURAL      12
4  2015 PERSONA NATURAL      13

Check the output from the sorted and unsorted

> tt[, .SD, by = .(year = year,pn_soc_informante)][year == 2013]$ruc_pk_informante
  [1] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3" 
 [23] "R5"  "R7"  "C9"  "R12" "R14" "R16" "R16" "R16" "R16" "R16" "R16" "C21" "R23" "R23" "R23" "R27" "R27" "R27" "R27" "R27" "R27" "R27"
 [45] "R27" "R27" "R30" "R33" "R33" "R34" "R34" "R37" "R41" "R41" "R44" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
 [67] "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
 [89] "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45" "R45"
[111] "R45" "R45" "R45"
> tt[, .SD, by = .(year = sort(year),pn_soc_informante)][year == 2013]$ruc_pk_informante
  [1] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3" 
 [23] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3" 
 [45] "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R3"  "R5"  "R5"  "R7" 
 [67] "R7"  "R7"  "R7"  "C9"  "C9"  "R12" "R12" "R12" "R14" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16"
 [89] "R16" "R16" "R16" "R16" "R16" "R16" "R16" "R16" "C21" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R23" "R27" "R27" "R27" "R27"
[111] "R27" "R27" "R27"

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM