简体   繁体   中英

dplyr not grouping as intended in R

I have a large data set that looks like this:

    structure(list(chr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", 
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "2", 
"20", "21", "22", "3", "4", "5", "6", "7", "8", "9", "X", "Y"
), class = "factor"), leftPos = c(1L, 15001L, 30001L, 45001L, 
60001L, 75001L, 90001L, 105001L, 120001L, 135001L, 150001L, 165001L, 
180001L, 195001L, 210001L, 225001L, 240001L, 255001L, 270001L, 
285001L, 300001L, 315001L, 330001L, 345001L, 360001L, 375001L, 
390001L, 405001L, 420001L, 435001L, 450001L, 465001L, 480001L, 
495001L, 510001L, 525001L, 540001L, 555001L, 570001L, 585001L, 
600001L, 615001L, 630001L, 645001L, 660001L, 675001L, 690001L, 
705001L, 720001L, 735001L, 750001L, 765001L, 780001L, 795001L, 
810001L, 825001L, 840001L, 855001L, 870001L, 885001L, 900001L, 
915001L, 930001L, 945001L, 960001L, 975001L, 990001L, 1005001L, 
1020001L, 1035001L), Means = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
0.057, 0.162, -0.088, -0.271, 0.038, 0.089, -0.091, -0.223, 0.091, 
-0.023, -0.008, NA, -0.152, -0.228)), .Names = c("chr", "leftPos", 
"Means"), row.names = c(NA, 70L), class = "data.frame")

I want to bin the rows by leftPos in windows of 1000000 and get, for each bin, the mean of the column called Means.

So I used this code:

# Bin rows within each chromosome by position and average Means per bin.
NadSWGSv <- NadSWGSv %>%
  # binnum is the integer quotient of leftPos by 1500000, so rows sharing a
  # quotient fall into the same bin.
  # NOTE(review): the surrounding text asks for bins of 1000000 -- confirm
  # which width is intended.
  group_by(chr, binnum = (leftPos) %/% 1500000) %>%
  # mean() is called without na.rm, so any NA in a bin makes that bin's
  # mean NA.
  summarise(Means = mean(Means)) %>%
  # Derive a position label from the bin index.
  # NOTE(review): the 120000 multiplier differs from the 1500000 divisor
  # above -- confirm.
  mutate(leftPos = (binnum+1) * 120000) %>%
  select(leftPos, Means)

but it gives me lots of NAs where I was expecting means, and I don't know why. This is the output I get:

structure(list(chr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", 
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "2", 
"20", "21", "22", "3", "4", "5", "6", "7", "8", "9", "X", "Y"
), class = "factor"), leftPos = c(120000, 240000, 360000, 480000, 
6e+05, 720000, 840000, 960000, 1080000, 1200000, 1320000, 1440000, 
1560000, 1680000, 1800000, 1920000, 2040000, 2160000, 2280000, 
2400000, 2520000, 2640000, 2760000, 2880000, 3e+06, 3120000, 
3240000, 3360000, 3480000, 3600000, 3720000, 3840000, 3960000, 
4080000, 4200000, 4320000, 4440000, 4560000, 4680000, 4800000, 
4920000, 5040000, 5160000, 5280000, 5400000, 5520000, 5640000, 
5760000, 5880000, 6e+06, 6120000, 6240000, 6360000, 6480000, 
6600000, 6720000, 6840000, 6960000, 7080000, 7200000, 7320000, 
7440000, 7560000, 7680000, 7800000, 7920000, 8040000, 8160000, 
8280000, 8400000), Means = c(NA, NA, NA, NA, NA, NA, -0.07272, 
NA, NA, NA, NA, NA, -0.000940000000000001, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
0.00673, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, -0.11982, NA, NA, -0.10338, -0.17146, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, -0.09175, NA, NA, NA, NA)), .Names = c("chr", 
"leftPos", "Means"), class = c("grouped_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -70L), vars = list(chr), drop = TRUE, indices = list(
    0:69), group_sizes = 70L, biggest_group_size = 70L, labels = structure(list(
    chr = structure(1L, .Label = c("1", "10", "11", "12", "13", 
    "14", "15", "16", "17", "18", "19", "2", "20", "21", "22", 
    "3", "4", "5", "6", "7", "8", "9", "X", "Y"), class = "factor")), class = "data.frame", row.names = c(NA, 
-1L), .Names = "chr", vars = list(chr)))

You need to use Means = mean(Means, na.rm = TRUE) in your summarise call.

The mean of any vector with an NA is NA by default. We can take the mean of those that are not NA by passing the argument na.rm = TRUE to mean:

mean(c(1, 2, NA))
[1] NA
mean(c(1, 2, NA), na.rm = TRUE)
[1] 1.5

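In your case, the column Means contains NAs, so every bin that includes at least one NA gets a mean of NA. Note that with na.rm = TRUE, a bin whose values are all NA will give NaN, because there is nothing left to average.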

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM