
R Replace values based on conditions (for same ID) without using for-loop

I have a df similar to this one, but much bigger (100,000 rows x 100 columns):

df <- data.frame(id = c("1", "2", "2", "3", "4", "4", "4", "4", "4", "4", "5"),
                 date = c("2015-01-15", "2004-03-01", "2017-03-15", "2000-01-15", "2006-05-08",
                          "2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11", "2015-12-19"),
                 A = c(0,1,1,0,1,1,0,0,1,1,1), B = c(1,0,1,0,1,0,0,0,1,1,1), C = c(0,1,0,0,0,1,1,1,1,1,0),
                 D = c(0,0,0,1,1,1,1,0,1,0,1), E = c(1,1,1,0,0,0,0,0,1,1,1),
                 A.1 = c(0,0,0,0,0,0,0,0,0,0,0), B.1 = c(0,0,0,0,0,0,0,0,0,0,0), C.1 = c(0,0,0,0,0,0,0,0,0,0,0),
                 D.1 = c(0,0,0,0,0,0,0,0,0,0,0), E.1 = c(0,0,0,0,0,0,0,0,0,0,0),
                 acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0))

What I have to accomplish is this:

structure(list(
  id = structure(c(1L, 2L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L),
                 .Label = c("1", "2", "3", "4", "5"), class = "factor"),
  date = structure(c(9L, 2L, 11L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 10L),
                   .Label = c("2000-01-15", "2004-03-01", "2006-05-08", "2008-05-09",
                              "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11",
                              "2015-01-15", "2015-12-19", "2017-03-15"), class = "factor"),
  A = c(0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1),
  B = c(1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1),
  C = c(0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0),
  D = c(0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1),
  E = c(1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  A.1 = c(0, 0, 4762, 0, 0, 732, 2925, 0, 0, 3017, 0),
  B.1 = c(0, 0, 0, 0, 0, 732, 0, 0, 0, 3017, 0),
  C.1 = c(0, 0, 4762, 0, 0, 0, 2925, 2956, 2986, 3017, 0),
  D.1 = c(0, 0, 0, 0, 0, 732, 2925, 2956, 0, 3017, 0),
  E.1 = c(0, 0, 4762, 0, 0, 0, 0, 0, 0, 3017, 0),
  acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0)),
  .Names = c("id", "date", "A", "B", "C", "D", "E", "A.1", "B.1", "C.1", "D.1", "E.1", "acumulativediff"),
  row.names = c(NA, -11L), class = "data.frame")

The idea is to replace the 0's in the A.1, B.1, ..., E.1 columns with the values of the acumulativediff column, based on two conditions (for row i):

if df[i,1] == df[i-1,1] & df[i,names] == "1" & df[i-1,names] == "1", use df[i,diff]
if df[i,1] == df[i-1,1] & df[i,names] == "0" & df[i-1,names] == "1", use df[i,diff]

I was able to do it using an inefficient for-loop, which seems to work on small data frames but not on bigger ones (it takes almost two hours):

names <- colnames(df[3:7])
names2 <- colnames(df[8:12])
diff <- which(colnames(df) == "acumulativediff")

for (i in 2:nrow(df)) {
  df[i, names2] <- ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "1" & df[i - 1, names] == "1",
                          df[i, diff],
                          ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "0" & df[i - 1, names] == "1",
                                 df[i, diff], 0))
}

Any idea or advice on how to avoid the loop and make the code more efficient?

I'd suggest ignoring the A.1, B.1, etc. columns and simply re-creating them with dplyr::mutate_at, applying the rules specified by the OP. dplyr::lag with default = 0 helps avoid NAs in the result.

library(dplyr)

df %>%
  select(-ends_with(".1")) %>%
  mutate_at(vars(A:E),
            funs(l = ifelse(lag(id) == id & lag(., default = 0) == "1", acumulativediff, 0)))


#    id       date A B C D E acumulativediff  A_l  B_l  C_l  D_l  E_l
# 1   1 2015-01-15 0 1 0 0 1               0    0    0    0    0    0
# 2   2 2004-03-01 1 0 1 0 1               0    0    0    0    0    0
# 3   2 2017-03-15 1 1 0 0 1            4762 4762    0 4762    0 4762
# 4   3 2000-01-15 0 0 0 1 0               0    0    0    0    0    0
# 5   4 2006-05-08 1 1 0 1 0               0    0    0    0    0    0
# 6   4 2008-05-09 1 0 1 1 0             732  732  732    0  732    0
# 7   4 2014-05-11 0 0 1 1 0            2925 2925    0 2925 2925    0
# 8   4 2014-06-11 0 0 1 0 0            2956    0    0 2956 2956    0
# 9   4 2014-07-11 1 1 1 1 1            2986    0    0 2986    0    0
# 10  4 2014-08-11 1 1 1 0 1            3017 3017 3017 3017 3017 3017
# 11  5 2015-12-19 1 1 0 1 1               0    0    0    0    0    0
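
If you prefer the new columns to keep the original A.1 ... E.1 names rather than A_l ... E_l, one possible follow-up (my own addition, not part of the answer above) is to rename them afterwards:

df %>%
  select(-ends_with(".1")) %>%
  mutate_at(vars(A:E),
            funs(l = ifelse(lag(id) == id & lag(., default = 0) == "1", acumulativediff, 0))) %>%
  # rename A_l ... E_l back to A.1 ... E.1
  rename_at(vars(ends_with("_l")), funs(sub("_l$", ".1", .)))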

You can also try this. A group_by replaces part of the ifelse approach used in the other answer, and here case_when is used to check whether lag(.) == 1, which is enough IMO.

df %>% 
 select(-ends_with(".1")) %>% 
 group_by(id) %>% 
 mutate_at(.vars = vars(A:E), funs("1"=case_when(lag(.) == 1 ~ acumulativediff, TRUE ~ 0))) %>% 
 ungroup()
# A tibble: 11 x 13
   id    date           A     B     C     D     E acumulativediff  A_1  B_1  C_1  D_1  E_1
   <fct> <fct>      <dbl> <dbl> <dbl> <dbl> <dbl>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 1     2015-01-15     0     1     0     0     1               0     0     0     0     0     0
 2 2     2004-03-01     1     0     1     0     1               0     0     0     0     0     0
 3 2     2017-03-15     1     1     0     0     1            4762  4762     0  4762     0  4762
 4 3     2000-01-15     0     0     0     1     0               0     0     0     0     0     0
 5 4     2006-05-08     1     1     0     1     0               0     0     0     0     0     0
 6 4     2008-05-09     1     0     1     1     0             732   732   732     0   732     0
 7 4     2014-05-11     0     0     1     1     0            2925  2925     0  2925  2925     0
 8 4     2014-06-11     0     0     1     0     0            2956     0     0  2956  2956     0
 9 4     2014-07-11     1     1     1     1     1            2986     0     0  2986     0     0
10 4     2014-08-11     1     1     1     0     1            3017  3017  3017  3017  3017  3017
11 5     2015-12-19     1     1     0     1     1               0     0     0     0     0     0

The df[i,1] == df[i-1,1] condition can be replaced by grouping on the id column. Another point: if columns A, B, etc. contain only "0" or "1", then the condition (df[i,names] == "1" & df[i-1,names] == "1" or df[i,names] == "0" & df[i-1,names] == "1") simplifies to just df[i-1,names] == "1", which is equivalent to the lag of df[,names].
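
A quick sanity check of that simplification (my own sketch, not from the answer), assuming the A ... E columns really only contain 0 or 1:

prev <- c(1, 1, 0, 0)   # df[i-1, names]
curr <- c(1, 0, 1, 0)   # df[i,   names]
original   <- (curr == 1 & prev == 1) | (curr == 0 & prev == 1)
simplified <- prev == 1
identical(original, simplified)
# [1] TRUE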

I propose a data.table solution where the lag is computed with the shift function. Frankly, it is not an example of good coding because of the eval(parse()) constructs, but I hope they make the solution easier to follow.

library(data.table)

setDT(df)

bin_names <- LETTERS[1:5]
# [1] "A" "B" "C" "D" "E"
bin_names.1 <- paste0(bin_names, ".1")
# [1] "A.1" "B.1" "C.1" "D.1" "E.1"

# slice the table into id groups with "by" and compute the "A.1", "B.1", etc. columns in a for loop
for (i in seq_along(bin_names)) {
  df[, eval(bin_names.1[i]) := shift(as.numeric(eval(parse(text = bin_names[i])))) * acumulativediff,
     by = .(id)]
}
df[]
#     id       date A B C D E  A.1  B.1  C.1  D.1  E.1 acumulativediff
#  1:  1 2015-01-15 0 1 0 0 1   NA   NA   NA   NA   NA               0
#  2:  2 2004-03-01 1 0 1 0 1   NA   NA   NA   NA   NA               0
#  3:  2 2017-03-15 1 1 0 0 1 4762    0 4762    0 4762            4762
#  4:  3 2000-01-15 0 0 0 1 0   NA   NA   NA   NA   NA               0
#  5:  4 2006-05-08 1 1 0 1 0   NA   NA   NA   NA   NA               0
#  6:  4 2008-05-09 1 0 1 1 0  732  732    0  732    0             732
#  7:  4 2014-05-11 0 0 1 1 0 2925    0 2925 2925    0            2925
#  8:  4 2014-06-11 0 0 1 0 0    0    0 2956 2956    0            2956
#  9:  4 2014-07-11 1 1 1 1 1    0    0 2986    0    0            2986
# 10:  4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017            3017
# 11:  5 2015-12-19 1 1 0 1 1   NA   NA   NA   NA   NA               0

If you don't want NAs in your table, a little extra work fixes them.

# replace NAs in a vector with a fill value (0 by default)
fillna <- function(x, fill = 0) { x[is.na(x)] <- fill; return(x) }
for (nm in bin_names.1) df[, eval(nm) := fillna(eval(parse(text = nm)))]
df[]
#     id       date A B C D E  A.1  B.1  C.1  D.1  E.1 acumulativediff
#  1:  1 2015-01-15 0 1 0 0 1    0    0    0    0    0               0
#  2:  2 2004-03-01 1 0 1 0 1    0    0    0    0    0               0
#  3:  2 2017-03-15 1 1 0 0 1 4762    0 4762    0 4762            4762
#  4:  3 2000-01-15 0 0 0 1 0    0    0    0    0    0               0
#  5:  4 2006-05-08 1 1 0 1 0    0    0    0    0    0               0
#  6:  4 2008-05-09 1 0 1 1 0  732  732    0  732    0             732
#  7:  4 2014-05-11 0 0 1 1 0 2925    0 2925 2925    0            2925
#  8:  4 2014-06-11 0 0 1 0 0    0    0 2956 2956    0            2956
#  9:  4 2014-07-11 1 1 1 1 1    0    0 2986    0    0            2986
# 10:  4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017            3017
# 11:  5 2015-12-19 1 1 0 1 1    0    0    0    0    0               0

Another option is to call shift with the fill = 0 parameter so you get zeros immediately:

shift(as.numeric(eval(parse(text = bin_names[i]))), fill = 0)*acumulativediff
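
A sketch of how the fill = 0 variant would slot into the loop above (my rewrite of the earlier loop, not verbatim from the answer):

for (i in seq_along(bin_names)) {
  df[, eval(bin_names.1[i]) := shift(as.numeric(eval(parse(text = bin_names[i]))),
                                     fill = 0) * acumulativediff,
     by = .(id)]
}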

Just noticed you actually want the operations grouped by ID; in that case my answer doesn't produce the correct result.

For loops aren't always inherently slow -- iterating by row is expensive, but iterating by column shouldn't cause too much overhead; the only way to vectorize this fully would be to use matrix methods.

This should perform as well as, or similarly to, most of the one-liners, but future-you might appreciate the readability.

setDT(df)

Suffix <- ".1"
SuffixedNames <- intersect(names(df),paste0(names(df),Suffix))
RawNames <- intersect(names(df),gsub(Suffix,"",SuffixedNames))

for (x in seq_along(RawNames)){

  thisRawName <- RawNames[[x]]
  thisSuffixedName <- SuffixedNames[[x]]

  Raw <- df[[thisRawName]]
  ## Using the shift() function from the data.table package
  Lagged <- shift(Raw, n = 1L, type = "lag", fill = -1L)

  ## Using set() from the data.table package
  set(df, j = thisSuffixedName,
      value = ifelse((Raw == Lagged & Raw == 1L & Lagged == 1L) | (Raw == 0L & Lagged == 1L),
                     df[["acumulativediff"]],
                     0L))
}
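
Since the conditions above never look at the id column, one hypothetical follow-up (not part of this answer) is to zero out the suffixed columns on the first row of each id run afterwards:

## zero A.1 ... E.1 on the first row of every id block so the per-id grouping is respected
FirstOfId <- c(TRUE, df[["id"]][-1] != df[["id"]][-nrow(df)])
for (col in SuffixedNames) set(df, i = which(FirstOfId), j = col, value = 0)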

In base R :

df2 <- df
# first, ignore id: lag A:E by one row and multiply by acumulativediff
df2[-1, 8:12] <- df[-nrow(df), 3:7] * df[-1, 13]
# then zero out the first row of each id
df2[which(diff(as.numeric(df$id)) == 1) + 1, 8:12] <- 0

#    id       date A B C D E  A.1  B.1  C.1  D.1  E.1 acumulativediff
# 1   1 2015-01-15 0 1 0 0 1    0    0    0    0    0               0
# 2   2 2004-03-01 1 0 1 0 1    0    0    0    0    0               0
# 3   2 2017-03-15 1 1 0 0 1 4762    0 4762    0 4762            4762
# 4   3 2000-01-15 0 0 0 1 0    0    0    0    0    0               0
# 5   4 2006-05-08 1 1 0 1 0    0    0    0    0    0               0
# 6   4 2008-05-09 1 0 1 1 0  732  732    0  732    0             732
# 7   4 2014-05-11 0 0 1 1 0 2925    0 2925 2925    0            2925
# 8   4 2014-06-11 0 0 1 0 0    0    0 2956 2956    0            2956
# 9   4 2014-07-11 1 1 1 1 1    0    0 2986    0    0            2986
# 10  4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017            3017
# 11  5 2015-12-19 1 1 0 1 1    0    0    0    0    0               0

Here's a benchmark against @MKR's current solution, on the given dataset and on a simulated dataset of ~100k rows. Mine is around 5 times faster on my machine in both cases.

mm <- function(df){
  df[-1, 8:12] <- df[-nrow(df), 3:7] * df[-1, 13]
  df[which(diff(as.numeric(df$id)) == 1) + 1, 8:12] <- 0
  df
}

mkr <- function(df){
  df %>% select(-ends_with(".1")) %>%
    mutate_at(vars(A:E),
              funs(l = ifelse(lag(id) == id & lag(., default = 0) == "1", acumulativediff, 0)))
}

microbenchmark::microbenchmark(mm(df),mkr(df),unit="relative")
# Unit: relative
#     expr      min       lq     mean   median       uq      max neval
#   mm(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
#  mkr(df) 7.788748 7.666287 5.265091 6.755467 6.655934 1.291942   100


big <- do.call(rbind, replicate(10000, df, simplify = FALSE))
big$id <- data.table::rleid(big$id)

microbenchmark::microbenchmark(mm(big),mkr(big),unit="relative")
# Unit: relative
#     expr      min       lq     mean   median       uq      max neval
#  mm(big) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
# mkr(big) 7.065627 4.945323 4.429752 4.910065 4.566391 1.765609   100

Does this work for you?

df <-data.frame(id=c("1","2","2","3","4","4", "4", "4", "4", "4", "5"), 
                date = c("2015-01-15", "2004-03-01", "2017-03-15", "2000-01-15", "2006-05-08", 
                         "2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11", "2015-12-19"), 
                A =c (0,1,1,0,1,1,0,0,1,1,1), B=c(1,0,1,0,1,0,0,0,1,1,1), C = c(0,1,0,0,0,1,1,1,1,1,0), 
                D = c(0,0,0,1,1,1,1,0,1,0,1), E = c(1,1,1,0,0,0,0,0,1,1,1), A.1 = c(0,0,0,0,0,0,0,0,0,0,0), 
                B.1 = c(0,0,0,0,0,0,0,0,0,0,0), C.1 = c(0,0,0,0,0,0,0,0,0,0,0), D.1 = c(0,0,0,0,0,0,0,0,0,0,0), 
                E.1 = c(0,0,0,0,0,0,0,0,0,0,0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0),
                stringsAsFactors = FALSE)
df2 <- df
names <- colnames(df[3:7])
names2 <- colnames(df[8:12])
diff <- which(colnames(df)=="acumulativediff")

df2[,names2] <- ifelse(df[,1] == dplyr::lag(df[,1]) & df[,names] == "1" & 
                         dplyr::lag(df[,names]) == "1",
                       df[,diff],
                       ifelse (df[,1]  == dplyr::lag(df[,1]) & df[,names] == "0" & 
                                 dplyr::lag(df[,names]) == "1", df[,diff], 0))
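
To verify this vectorised version against the OP's original for-loop on the same data, one possible check (my own addition, not part of the answer) is:

df_loop <- df
for (i in 2:nrow(df)) {
  df_loop[i, names2] <- ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "1" & df[i - 1, names] == "1",
                               df[i, diff],
                               ifelse(df[i, 1] == df[i - 1, 1] & df[i, names] == "0" & df[i - 1, names] == "1",
                                      df[i, diff], 0))
}
all.equal(df2[, names2], df_loop[, names2])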
