簡體   English   中英

使用dplyr標記和計數值之間的差距

[英]Using dplyr to label and count gaps between values

我有這個數據框:

    # Example data: 29 rows of fixation sequence numbers per subject (Name) and
    # stimulus (StimulusName); NA rows are the gaps between fixations.
    # The "spec" attribute is the column specification carried over from dput().
    df <- structure(
      list(
        Name = rep(c("sub1", "sub2"), times = c(23L, 6L)),
        StimulusName = rep(
          c("Alpha11", "Alpha12", "Alpha11"),
          times = c(18L, 5L, 6L)
        ),
        FixationSeq = c(
          # sub1 / Alpha11
          2L, 2L, 2L, 2L, NA, NA, NA, NA, 3L,
          3L, 3L, 3L, 3L, NA, NA, NA, NA, NA,
          # sub1 / Alpha12
          1L, NA, NA, 2L, NA,
          # sub2 / Alpha11
          NA, NA, NA, NA, 2L, 2L
        )
      ),
      row.names = c(NA, -29L),
      class = c("tbl_df", "tbl", "data.frame"),
      spec = structure(
        list(
          cols = list(
            Name = structure(
              list(),
              class = c("collector_character", "collector")
            ),
            StimulusName = structure(
              list(),
              class = c("collector_character", "collector")
            ),
            FixationSeq = structure(
              list(),
              class = c("collector_integer", "collector")
            )
          ),
          default = structure(
            list(),
            class = c("collector_guess", "collector")
          )
        ),
        class = "col_spec"
      )
    )

FixationSeq 列中有唯一編號(在我的示例中,當 Name = sub1、StimulusName = Alpha11 時為 2 和 3)。在這些數字之間有以 NA 填充的段,3 之后還有一個用 NA 填充的段。

我希望能夠創建一個新列SaccadeCount並將一個遞增的數字標簽添加到NA 段的每個實例(作為一個整體,即可能是多行)到SaccadeCount的相關行。

另外,我想再有一個名為 SaccadeDuration 的列,並在其中填入每個唯一 NA 段的總行數。因此在示例 df 中,與 2 和 3 之間的 NA 段相對應的行將填充為 '3',因為那是 2 和 3 之間的行的總數。

我想使用 dplyr 完成此操作,並按 Name 和 StimulusName 列對操作進行分組。

輸出可能看起來像這樣:

    Name    StimulusName    FixationSeq SaccadeCount    SaccadeDuration
   sub1     Alpha11             2       
   sub1     Alpha11             2       
   sub1     Alpha11             2       
   sub1     Alpha11             2       
   sub1     Alpha11             NA            1              3
   sub1     Alpha11             NA            1              3
   sub1     Alpha11             NA            1              3
   sub1     Alpha11             3       
   sub1     Alpha11             3       
   sub1     Alpha11             3       
   sub1     Alpha11             3       
   sub1     Alpha11             3       
   sub1     Alpha11             3       
   sub1     Alpha11             NA            2              5
   sub1     Alpha11             NA            2              5
   sub1     Alpha11             NA            2              5
   sub1     Alpha11             NA            2              5
   sub1     Alpha11             NA            2              5
   sub1     Alpha12             1       
   sub1     Alpha12             NA            1              2      
   sub1     Alpha12             NA            1              2
   sub1     Alpha12             2
   sub1     Alpha12             NA            2              1  
   sub2     Alpha11             NA            1              4
   sub2     Alpha11             NA            1              4
   sub2     Alpha11             NA            1              4
   sub2     Alpha11             NA            1              4
   sub2     Alpha11             2                  
   sub2     Alpha11             2 

非常感謝您的時間和幫助。

使用data.table

碼:

library(data.table)
# Label each run of consecutive NA values in `FixationSeq` and record its length.
#
# Args:
#   x: a data.frame-like object (one group of the data) with a `FixationSeq`
#      column.
#
# Returns:
#   `x` with two integer columns added:
#     SaccadeCount    - 1, 2, 3, ... for the first, second, third ... NA run;
#                       NA on rows where `FixationSeq` is not NA.
#     SaccadeDuration - number of rows in the NA run the row belongs to;
#                       NA on rows where `FixationSeq` is not NA.
#
# Both columns are always integer, even when `x` contains no NA at all, so the
# per-group results have a consistent type when they are combined.
fun1 <- function(x) {
  is_gap <- is.na(x$FixationSeq)

  # One entry per run of identical flags; `values` is TRUE for NA runs.
  runs <- rle(is_gap)

  # cumsum() over the run flags numbers the NA runs 1, 2, ...; the entries
  # belonging to non-NA runs are blanked out afterwards.
  gap_id <- cumsum(runs$values)
  gap_id[!runs$values] <- NA_integer_

  gap_len <- runs$lengths
  gap_len[!runs$values] <- NA_integer_

  # Expand the per-run values back to one value per row.
  x$SaccadeCount <- rep(gap_id, runs$lengths)
  x$SaccadeDuration <- rep(gap_len, runs$lengths)

  x
}

# Turn df into a data.table by reference, then run fun1 on the rows of each
# Name / StimulusName combination.
setDT(df)
df[, fun1(.SD), by = c("Name", "StimulusName")]

您可以以dplyr方式使用fun1

# Same per-group application of fun1, expressed as a dplyr pipeline; inside
# do() the dot stands for the rows of the current group.
ans <- df %>%
  group_by(Name, StimulusName) %>%
  dplyr::do(fun1(.))

結果:

 #   Name StimulusName FixationSeq SaccadeCount SaccadeDuration
 #1: sub1      Alpha11           2           NA              NA
 #2: sub1      Alpha11           2           NA              NA
 #3: sub1      Alpha11           2           NA              NA
 #4: sub1      Alpha11           2           NA              NA
 #5: sub1      Alpha11           2           NA              NA
 #6: sub1      Alpha11           2           NA              NA
 #7: sub1      Alpha11           2           NA              NA
 #8: sub1      Alpha11           2           NA              NA
 #9: sub1      Alpha11           2           NA              NA
#10: sub1      Alpha11           2           NA              NA
#11: sub1      Alpha11           2           NA              NA
#12: sub1      Alpha11           2           NA              NA
#13: sub1      Alpha11           2           NA              NA
#14: sub1      Alpha11           2           NA              NA
#15: sub1      Alpha11           2           NA              NA
#16: sub1      Alpha11           2           NA              NA
#17: sub1      Alpha11           2           NA              NA
#18: sub1      Alpha11           2           NA              NA
#19: sub1      Alpha11           2           NA              NA
#20: sub1      Alpha11           2           NA              NA
#21: sub1      Alpha11           2           NA              NA
#22: sub1      Alpha11          NA            1               5
#23: sub1      Alpha11          NA            1               5
#24: sub1      Alpha11          NA            1               5
#25: sub1      Alpha11          NA            1               5
#26: sub1      Alpha11          NA            1               5
#27: sub1       Alpha1           9           NA              NA
#28: sub1       Alpha1           9           NA              NA
#29: sub1       Alpha1           9           NA              NA
#30: sub1       Alpha1           9           NA              NA
#31: sub1       Alpha1           9           NA              NA
#32: sub1       Alpha1           9           NA              NA
#33: sub1       Alpha1           9           NA              NA
#    Name StimulusName FixationSeq SaccadeCount SaccadeDuration

  • 我的方法使用預定義的功能fun1來為每個組完成工作。
  • 這些組似乎是由 Name 和 StimulusName 定義的。
  • 我用到了兩個非常重要、您應該了解的函數:請參閱 ?rle 和 ?rleidv。
  • 我用所有NA值預填充新列,然后在需要的地方添加新值。

這應該可行。不過,也許有一種更簡單的方法。第一個 mutate 標記每個 NA 段的開始;group_by 和第二個 mutate 計算每個段中 NA 的數量。

library(dplyr)
# Number the NA segments of FixationSeq and record their lengths, separately
# for every Name / StimulusName group. Rows that are not part of an NA segment
# get 0 in both new columns.
df %>%
  # Work within each subject/stimulus group so that the numbering restarts at 1
  # and an NA run cannot continue across a group boundary.
  group_by(Name, StimulusName) %>%
  mutate(
    # A segment starts on an NA row whose predecessor is not NA. The first row
    # of a group has no predecessor, so default = FALSE makes a leading NA run
    # count as a segment start as well.
    SaccadeCount = cumsum(
      is.na(FixationSeq) &
        !dplyr::lag(is.na(FixationSeq), default = FALSE)
    ) * is.na(FixationSeq)
  ) %>%
  # One group per segment (plus one group holding the non-NA rows, count 0).
  group_by(Name, StimulusName, SaccadeCount) %>%
  # n() is the segment length; multiplying by the NA flag zeroes non-NA rows.
  mutate(SaccadeDuration = n() * is.na(FixationSeq)) %>%
  ungroup()

使用dplyr

# Per Name / StimulusName group: `x` flags NA rows, `count` numbers the NA runs
# (0 on non-NA rows), and `dur` holds the run length (NA on non-NA rows).
# The result stays grouped by Name, StimulusName and count.
df %>%
  group_by(Name, StimulusName) %>%
  mutate(x = is.na(FixationSeq)) %>%
  # A run starts where the flag changes (or on the first row) and the row is NA.
  mutate(count = x * cumsum(x & c(TRUE, diff(x) != 0L))) %>%
  group_by(Name, StimulusName, count) %>%
  # Within a run every row gets the run's row count; count 0 rows stay NA.
  mutate(dur = if_else(count != 0L, n(), NA_integer_))

對應的(更具 data.table 風格的)data.table 版本:

library(data.table)
# Convert df to a data.table by reference.
setDT(df)

# count: running number of each NA run within a Name / StimulusName group,
# 0 on rows where FixationSeq is not NA.
df[, count := {
  gap <- is.na(FixationSeq)
  gap * cumsum(gap & c(TRUE, diff(gap) != 0L))
}, by = .(Name, StimulusName)]

# dur: number of rows in each NA run; rows with count 0 are left untouched (NA).
df[count != 0L, dur := .N, by = .(Name, StimulusName, count)]
#     Name StimulusName FixationSeq count dur
#  1: sub1      Alpha11           2     0  NA
#  2: sub1      Alpha11           2     0  NA
#  3: sub1      Alpha11           2     0  NA
#  4: sub1      Alpha11           2     0  NA
#  5: sub1      Alpha11          NA     1   4
#  6: sub1      Alpha11          NA     1   4
#  7: sub1      Alpha11          NA     1   4
#  8: sub1      Alpha11          NA     1   4
#  9: sub1      Alpha11           3     0  NA
# 10: sub1      Alpha11           3     0  NA
# 11: sub1      Alpha11           3     0  NA
# 12: sub1      Alpha11           3     0  NA
# 13: sub1      Alpha11           3     0  NA
# 14: sub1      Alpha11          NA     2   5
# 15: sub1      Alpha11          NA     2   5
# 16: sub1      Alpha11          NA     2   5
# 17: sub1      Alpha11          NA     2   5
# 18: sub1      Alpha11          NA     2   5
# 19: sub1      Alpha12           1     0  NA
# 20: sub1      Alpha12          NA     1   2
# 21: sub1      Alpha12          NA     1   2
# 22: sub1      Alpha12           2     0  NA
# 23: sub1      Alpha12          NA     2   1
# 24: sub2      Alpha11          NA     1   4
# 25: sub2      Alpha11          NA     1   4
# 26: sub2      Alpha11          NA     1   4
# 27: sub2      Alpha11          NA     1   4
# 28: sub2      Alpha11           2     0  NA
# 29: sub2      Alpha11           2     0  NA
#     Name StimulusName FixationSeq count dur

如果需要,將count == 0更改為NA

# Rows outside any NA run carry count 0; blank those out with a typed NA so the
# column stays integer.
df[count == 0L, count := NA_integer_]

我不會像問題中所示那樣將其改為「空白」(""),因為這會迫使該列變為 character 類型,並使這些數字無法用於進一步的分析。


cumsum(c(TRUE, diff(x) != 0L) & x) * x逐步說明:

# Toy input: two values, a run of two NAs, one value, then a run of three NAs.
v <- c(1, 1, NA, NA, 1, NA, NA, NA)
# x flags the NA positions.
x <- is.na(v)
x
# [1] FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE

# Difference between neighbouring flags: 1 where an NA follows a value,
# -1 where a value follows an NA, 0 where nothing changes.
diff(x)
# [1]  0  1  0 -1  1  0  0

# TRUE wherever the flag changes from one element to the next.
diff(x) != 0L
# [1] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE

# Prepend TRUE so the first element can start a run, then keep only the
# changes that land on an NA: these are the first elements of each NA run.
c(TRUE, diff(x) != 0L) & x
# [1] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE

# Running total of run starts = number of the most recent NA run so far.
cumsum(c(TRUE, diff(x) != 0L) & x)
# [1] 0 0 1 1 1 2 2 2

# Multiply by the NA flag to reset the non-NA positions to 0.
cumsum(c(TRUE, diff(x) != 0L) & x) * x
# [1] 0 0 1 1 0 2 2 2

其余的希望相當簡單。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM