
R Replace values based on conditions (for same ID) without using for-loop

I have a df similar to this one, but much bigger (100,000 rows x 100 columns):

df <- data.frame(id = c("1", "2", "2", "3", "4", "4", "4", "4", "4", "4", "5"),
                 date = c("2015-01-15", "2004-03-01", "2017-03-15", "2000-01-15", "2006-05-08",
                          "2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11", "2015-12-19"),
                 A = c(0,1,1,0,1,1,0,0,1,1,1), B = c(1,0,1,0,1,0,0,0,1,1,1), C = c(0,1,0,0,0,1,1,1,1,1,0),
                 D = c(0,0,0,1,1,1,1,0,1,0,1), E = c(1,1,1,0,0,0,0,0,1,1,1),
                 A.1 = c(0,0,0,0,0,0,0,0,0,0,0), B.1 = c(0,0,0,0,0,0,0,0,0,0,0), C.1 = c(0,0,0,0,0,0,0,0,0,0,0),
                 D.1 = c(0,0,0,0,0,0,0,0,0,0,0), E.1 = c(0,0,0,0,0,0,0,0,0,0,0),
                 acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0))

What I have to accomplish is this:

structure(list(
  id = structure(c(1L, 2L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L),
                 .Label = c("1", "2", "3", "4", "5"), class = "factor"),
  date = structure(c(9L, 2L, 11L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 10L),
                   .Label = c("2000-01-15", "2004-03-01", "2006-05-08", "2008-05-09",
                              "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11",
                              "2015-01-15", "2015-12-19", "2017-03-15"), class = "factor"),
  A = c(0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1),
  B = c(1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1),
  C = c(0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0),
  D = c(0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1),
  E = c(1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  A.1 = c(0, 0, 4762, 0, 0, 732, 2925, 0, 0, 3017, 0),
  B.1 = c(0, 0, 0, 0, 0, 732, 0, 0, 0, 3017, 0),
  C.1 = c(0, 0, 4762, 0, 0, 0, 2925, 2956, 2986, 3017, 0),
  D.1 = c(0, 0, 0, 0, 0, 732, 2925, 2956, 0, 3017, 0),
  E.1 = c(0, 0, 4762, 0, 0, 0, 0, 0, 0, 3017, 0),
  acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0)),
  .Names = c("id", "date", "A", "B", "C", "D", "E", "A.1", "B.1", "C.1", "D.1", "E.1", "acumulativediff"),
  row.names = c(NA, -11L), class = "data.frame")

The idea is to replace the 0's in the A.1, B.1, ..., E.1 columns with the values of the acumulativediff column, based on two conditions (for row i):

if df[i,1] == df[i-1,1] & df[i,names] == "1" & df[i-1,names] == "1", use df[i,diff]
if df[i,1] == df[i-1,1] & df[i,names] == "0" & df[i-1,names] == "1", use df[i,diff]

I was able to do it using an inefficient for-loop, which seems to work on small data frames but not on bigger ones (it takes almost two hours):

names <- colnames(df[3:7])
names2 <- colnames(df[8:12])
diff <- which(colnames(df) == "acumulativediff")

for (i in 2:nrow(df)) {
  df[i, names2] <- ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "1" & df[i - 1, names] == "1",
                          df[i, diff],
                          ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "0" & df[i - 1, names] == "1",
                                 df[i, diff], 0))
}

Any idea or advice on how to avoid the loop and make the code more efficient?

I'd suggest ignoring the A.1, B.1, etc. columns and simply re-creating them with dplyr::mutate_at, applying the rules specified by the OP. dplyr::lag with default = 0 helps avoid NAs in the result.

library(dplyr)

df %>%
  select(-ends_with(".1")) %>%
  mutate_at(vars(A:E),
            funs(l = ifelse(lag(id) == id & lag(., default = 0) == "1", acumulativediff, 0)))


#    id       date A B C D E acumulativediff  A_l  B_l  C_l  D_l  E_l
# 1   1 2015-01-15 0 1 0 0 1               0    0    0    0    0    0
# 2   2 2004-03-01 1 0 1 0 1               0    0    0    0    0    0
# 3   2 2017-03-15 1 1 0 0 1            4762 4762    0 4762    0 4762
# 4   3 2000-01-15 0 0 0 1 0               0    0    0    0    0    0
# 5   4 2006-05-08 1 1 0 1 0               0    0    0    0    0    0
# 6   4 2008-05-09 1 0 1 1 0             732  732  732    0  732    0
# 7   4 2014-05-11 0 0 1 1 0            2925 2925    0 2925 2925    0
# 8   4 2014-06-11 0 0 1 0 0            2956    0    0 2956 2956    0
# 9   4 2014-07-11 1 1 1 1 1            2986    0    0 2986    0    0
# 10  4 2014-08-11 1 1 1 0 1            3017 3017 3017 3017 3017 3017
# 11  5 2015-12-19 1 1 0 1 1               0    0    0    0    0    0
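
If you prefer the new columns to keep the original A.1 ... E.1 names rather than A_l ... E_l, one possible follow-up (my own addition, not part of the answer above) is to rename them afterwards:

df %>%
  select(-ends_with(".1")) %>%
  mutate_at(vars(A:E),
            funs(l = ifelse(lag(id) == id & lag(., default = 0) == "1", acumulativediff, 0))) %>%
  # rename A_l ... E_l back to A.1 ... E.1
  rename_at(vars(ends_with("_l")), funs(sub("_l$", ".1", .)))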

You can also try this. A group_by replaces part of the ifelse approach used in the other answer, and here case_when is used to check whether lag(.) == 1, which is enough IMO.

df %>% 
 select(-ends_with(".1")) %>% 
 group_by(id) %>% 
 mutate_at(.vars = vars(A:E), funs("1"=case_when(lag(.) == 1 ~ acumulativediff, TRUE ~ 0))) %>% 
 ungroup()
# A tibble: 11 x 13
   id    date           A     B     C     D     E acumulativediff  A_1  B_1  C_1  D_1  E_1
   <fct> <fct>      <dbl> <dbl> <dbl> <dbl> <dbl>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 1     2015-01-15     0     1     0     0     1               0     0     0     0     0     0
 2 2     2004-03-01     1     0     1     0     1               0     0     0     0     0     0
 3 2     2017-03-15     1     1     0     0     1            4762  4762     0  4762     0  4762
 4 3     2000-01-15     0     0     0     1     0               0     0     0     0     0     0
 5 4     2006-05-08     1     1     0     1     0               0     0     0     0     0     0
 6 4     2008-05-09     1     0     1     1     0             732   732   732     0   732     0
 7 4     2014-05-11     0     0     1     1     0            2925  2925     0  2925  2925     0
 8 4     2014-06-11     0     0     1     0     0            2956     0     0  2956  2956     0
 9 4     2014-07-11     1     1     1     1     1            2986     0     0  2986     0     0
10 4     2014-08-11     1     1     1     0     1            3017  3017  3017  3017  3017  3017
11 5     2015-12-19     1     1     0     1     1               0     0     0     0     0     0

The df[i,1] == df[i-1,1] condition can be replaced by grouping on the id column. Another point: if columns A, B, etc. contain only "0" or "1", then the condition (df[i,names] == "1" & df[i-1,names] == "1" or df[i,names] == "0" & df[i-1,names] == "1") simplifies to just df[i-1,names] == "1", which is equivalent to the lag of df[,names].
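
A quick sanity check of that simplification (my own sketch, not from the answer), assuming the A ... E columns really only contain 0 or 1:

prev <- c(1, 1, 0, 0)   # df[i-1, names]
curr <- c(1, 0, 1, 0)   # df[i,   names]
original   <- (curr == 1 & prev == 1) | (curr == 0 & prev == 1)
simplified <- prev == 1
identical(original, simplified)
# [1] TRUE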

I propose a data.table solution where the lag is computed with the shift function. Frankly, it is not an example of good coding because of the eval(parse()) constructs, but I hope they make the solution easier to follow.

library(data.table)

setDT(df)

bin_names <- LETTERS[1:5]
# [1] "A" "B" "C" "D" "E"
bin_names.1 <- paste0(bin_names, ".1")
# [1] "A.1" "B.1" "C.1" "D.1" "E.1"

# slice the table into id groups with "by" and compute the "A.1", "B.1", etc. columns in a for loop
for (i in seq_along(bin_names)) {
  df[, eval(bin_names.1[i]) := shift(as.numeric(eval(parse(text = bin_names[i])))) * acumulativediff,
     by = .(id)]
}
df[]
#     id       date A B C D E  A.1  B.1  C.1  D.1  E.1 acumulativediff
#  1:  1 2015-01-15 0 1 0 0 1   NA   NA   NA   NA   NA               0
#  2:  2 2004-03-01 1 0 1 0 1   NA   NA   NA   NA   NA               0
#  3:  2 2017-03-15 1 1 0 0 1 4762    0 4762    0 4762            4762
#  4:  3 2000-01-15 0 0 0 1 0   NA   NA   NA   NA   NA               0
#  5:  4 2006-05-08 1 1 0 1 0   NA   NA   NA   NA   NA               0
#  6:  4 2008-05-09 1 0 1 1 0  732  732    0  732    0             732
#  7:  4 2014-05-11 0 0 1 1 0 2925    0 2925 2925    0            2925
#  8:  4 2014-06-11 0 0 1 0 0    0    0 2956 2956    0            2956
#  9:  4 2014-07-11 1 1 1 1 1    0    0 2986    0    0            2986
# 10:  4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017            3017
# 11:  5 2015-12-19 1 1 0 1 1   NA   NA   NA   NA   NA               0

If you don't want NAs in your table, a little extra work fixes them.

# replace NAs in a vector with a fill value (0 by default)
fillna <- function(x, fill = 0) { x[is.na(x)] <- fill; return(x) }
for (nm in bin_names.1) df[, eval(nm) := fillna(eval(parse(text = nm)))]
df[]
#     id       date A B C D E  A.1  B.1  C.1  D.1  E.1 acumulativediff
#  1:  1 2015-01-15 0 1 0 0 1    0    0    0    0    0               0
#  2:  2 2004-03-01 1 0 1 0 1    0    0    0    0    0               0
#  3:  2 2017-03-15 1 1 0 0 1 4762    0 4762    0 4762            4762
#  4:  3 2000-01-15 0 0 0 1 0    0    0    0    0    0               0
#  5:  4 2006-05-08 1 1 0 1 0    0    0    0    0    0               0
#  6:  4 2008-05-09 1 0 1 1 0  732  732    0  732    0             732
#  7:  4 2014-05-11 0 0 1 1 0 2925    0 2925 2925    0            2925
#  8:  4 2014-06-11 0 0 1 0 0    0    0 2956 2956    0            2956
#  9:  4 2014-07-11 1 1 1 1 1    0    0 2986    0    0            2986
# 10:  4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017            3017
# 11:  5 2015-12-19 1 1 0 1 1    0    0    0    0    0               0

Another option is to call shift with the fill = 0 parameter so you get zeros immediately:

shift(as.numeric(eval(parse(text = bin_names[i]))), fill = 0)*acumulativediff
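
A sketch of how the fill = 0 variant would slot into the loop above (my rewrite of the earlier loop, not verbatim from the answer):

for (i in seq_along(bin_names)) {
  df[, eval(bin_names.1[i]) := shift(as.numeric(eval(parse(text = bin_names[i]))),
                                     fill = 0) * acumulativediff,
     by = .(id)]
}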

Just noticed you actually want the operations grouped by ID; in that case my answer doesn't produce the correct result.

For loops aren't always inherently slow -- iterating by row is expensive, but iterating by column shouldn't cause too much overhead; the only way to vectorize this fully would be to use matrix methods.

This should perform as well as, or similarly to, most of the one-liners, but future-you might appreciate the readability.

setDT(df)

Suffix <- ".1"
SuffixedNames <- intersect(names(df),paste0(names(df),Suffix))
RawNames <- intersect(names(df),gsub(Suffix,"",SuffixedNames))

for (x in seq_along(RawNames)){

  thisRawName <- RawNames[[x]]
  thisSuffixedName <- SuffixedNames[[x]]

  Raw <- df[[thisRawName]]
  ## Using the shift() function from the data.table package
  Lagged <- shift(Raw, n = 1L, type = "lag", fill = -1L)

  ## Using set() from the data.table package
  set(df, j = thisSuffixedName,
      value = ifelse((Raw == Lagged & Raw == 1L & Lagged == 1L) | (Raw == 0L & Lagged == 1L),
                     df[["acumulativediff"]],
                     0L))
}
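
Since the conditions above never look at the id column, one hypothetical follow-up (not part of this answer) is to zero out the suffixed columns on the first row of each id run afterwards:

## zero A.1 ... E.1 on the first row of every id block so the per-id grouping is respected
FirstOfId <- c(TRUE, df[["id"]][-1] != df[["id"]][-nrow(df)])
for (col in SuffixedNames) set(df, i = which(FirstOfId), j = col, value = 0)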

In base R :

df2 <- df
# first, ignore id: lag A:E by one row and multiply by acumulativediff
df2[-1, 8:12] <- df[-nrow(df), 3:7] * df[-1, 13]
# then zero out the first row of each id
df2[which(diff(as.numeric(df$id)) == 1) + 1, 8:12] <- 0

#    id       date A B C D E  A.1  B.1  C.1  D.1  E.1 acumulativediff
# 1   1 2015-01-15 0 1 0 0 1    0    0    0    0    0               0
# 2   2 2004-03-01 1 0 1 0 1    0    0    0    0    0               0
# 3   2 2017-03-15 1 1 0 0 1 4762    0 4762    0 4762            4762
# 4   3 2000-01-15 0 0 0 1 0    0    0    0    0    0               0
# 5   4 2006-05-08 1 1 0 1 0    0    0    0    0    0               0
# 6   4 2008-05-09 1 0 1 1 0  732  732    0  732    0             732
# 7   4 2014-05-11 0 0 1 1 0 2925    0 2925 2925    0            2925
# 8   4 2014-06-11 0 0 1 0 0    0    0 2956 2956    0            2956
# 9   4 2014-07-11 1 1 1 1 1    0    0 2986    0    0            2986
# 10  4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017            3017
# 11  5 2015-12-19 1 1 0 1 1    0    0    0    0    0               0

Here's a benchmark against @MKR's current solution, on the given dataset and on a simulated dataset of ~100k rows. Mine is around 5 times faster on my machine in both cases.

mm <- function(df){
  df[-1, 8:12] <- df[-nrow(df), 3:7] * df[-1, 13]
  df[which(diff(as.numeric(df$id)) == 1) + 1, 8:12] <- 0
  df
}

mkr <- function(df){
  df %>% select(-ends_with(".1")) %>%
    mutate_at(vars(A:E),
              funs(l = ifelse(lag(id) == id & lag(., default = 0) == "1", acumulativediff, 0)))
}

microbenchmark::microbenchmark(mm(df),mkr(df),unit="relative")
# Unit: relative
#     expr      min       lq     mean   median       uq      max neval
#   mm(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
#  mkr(df) 7.788748 7.666287 5.265091 6.755467 6.655934 1.291942   100


big <- do.call(rbind, replicate(10000, df, simplify = FALSE))
big$id <- data.table::rleid(big$id)

microbenchmark::microbenchmark(mm(big),mkr(big),unit="relative")
# Unit: relative
#     expr      min       lq     mean   median       uq      max neval
#  mm(big) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
# mkr(big) 7.065627 4.945323 4.429752 4.910065 4.566391 1.765609   100

Does this work for you?

df <-data.frame(id=c("1","2","2","3","4","4", "4", "4", "4", "4", "5"), 
                date = c("2015-01-15", "2004-03-01", "2017-03-15", "2000-01-15", "2006-05-08", 
                         "2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11", "2015-12-19"), 
                A =c (0,1,1,0,1,1,0,0,1,1,1), B=c(1,0,1,0,1,0,0,0,1,1,1), C = c(0,1,0,0,0,1,1,1,1,1,0), 
                D = c(0,0,0,1,1,1,1,0,1,0,1), E = c(1,1,1,0,0,0,0,0,1,1,1), A.1 = c(0,0,0,0,0,0,0,0,0,0,0), 
                B.1 = c(0,0,0,0,0,0,0,0,0,0,0), C.1 = c(0,0,0,0,0,0,0,0,0,0,0), D.1 = c(0,0,0,0,0,0,0,0,0,0,0), 
                E.1 = c(0,0,0,0,0,0,0,0,0,0,0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0),
                stringsAsFactors = FALSE)
df2 <- df
names <- colnames(df[3:7])
names2 <- colnames(df[8:12])
diff <- which(colnames(df)=="acumulativediff")

df2[,names2] <- ifelse(df[,1] == dplyr::lag(df[,1]) & df[,names] == "1" & 
                         dplyr::lag(df[,names]) == "1",
                       df[,diff],
                       ifelse (df[,1]  == dplyr::lag(df[,1]) & df[,names] == "0" & 
                                 dplyr::lag(df[,names]) == "1", df[,diff], 0))
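
To verify this vectorised version against the OP's original for-loop on the same data, one possible check (my own addition, not part of the answer) is:

df_loop <- df
for (i in 2:nrow(df)) {
  df_loop[i, names2] <- ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "1" & df[i - 1, names] == "1",
                               df[i, diff],
                               ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "0" & df[i - 1, names] == "1",
                                      df[i, diff], 0))
}
all.equal(df2[, names2], df_loop[, names2])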
