I have a df similar to this one but much bigger (100.000 rows x 100 columns)
df <-data.frame(id=c("1","2","2","3","4","4", "4", "4", "4", "4", "5"), date = c("2015-01-15", "2004-03-01", "2017-03-15", "2000-01-15", "2006-05-08", "2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11", "2015-12-19"), A =c (0,1,1,0,1,1,0,0,1,1,1), B=c(1,0,1,0,1,0,0,0,1,1,1), C = c(0,1,0,0,0,1,1,1,1,1,0), D = c(0,0,0,1,1,1,1,0,1,0,1), E = c(1,1,1,0,0,0,0,0,1,1,1), A.1 = c(0,0,0,0,0,0,0,0,0,0,0), B.1 = c(0,0,0,0,0,0,0,0,0,0,0), C.1 = c(0,0,0,0,0,0,0,0,0,0,0), D.1 = c(0,0,0,0,0,0,0,0,0,0,0), E.1 = c(0,0,0,0,0,0,0,0,0,0,0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0))
What I have to accomplish is this:
structure(list(id = structure(c(1L, 2L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), date = structure(c(9L, 2L, 11L, 1L, 3L, 4L, 5L, 6L, 7L, 8L,10L), .Label = c("2000-01-15", "2004-03-01", "2006-05-08","2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11","2015-01-15", "2015-12-19", "2017-03-15"), class = "factor"), A = c(0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1), B = c(1, 0, 1, 0,1, 0, 0, 0, 1, 1, 1), C = c(0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0), D = c(0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1), E = c(1, 1, 1,0, 0, 0, 0, 0, 1, 1, 1), A.1 = c(0, 0, 4762, 0, 0, 732, 2925,0, 0, 3017, 0), B.1 = c(0, 0, 0, 0, 0, 732, 0, 0, 0, 3017,0), C.1 = c(0, 0, 4762, 0, 0, 0, 2925, 2956, 2986, 3017,
0), D.1 = c(0, 0, 0, 0, 0, 732, 2925, 2956, 0, 3017, 0),E.1 = c(0, 0, 4762, 0, 0, 0, 0, 0, 0, 3017, 0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0)), .Names = c("id","date", "A", "B", "C", "D", "E", "A.1", "B.1", "C.1", "D.1", "E.1", "acumulativediff"), row.names = c(NA,-11L), class = "data.frame")
The idea is to replace 0's from A.1, B.1, C.1 columns with the values of 'acumulativediff' column, based on two conditions:
df[i,1] == df[i-1,1] & df[i,names] == "1" & df[i-1,names] == "1", df[i,diff]
df[i,1] == df[i-1,1] & df[i,names] == "0" & df[i-1,names] == "1", df[i,diff]
I was able to do it, using a non-efficient loop-for which seems to work on small df but not with bigger ones (it takes almost two hours)
names <- colnames(df[3:7])
names2 <- colnames(df[8:12])
diff <- which(colnames(df)=="acumulativediff")
for (i in 2:nrow(df)){
df[i,names2] <- ifelse (df[i,1] == df[i-1,1] & df[i,names] == "1" &
df[i-1,names] == "1", df[i,diff],
ifelse (df[i,1] == df[i-1,1] & df[i,names] == "0" & df[i-1,names] == "1", df[i,diff], 0))}
Any idea or advice to omit the loop to achieve a more efficient code?
I'll suggest to ignore A.1, B.1 etc
columns. Just re-create those columns using dplyr::mutate_at
and the rules specified by OP
. The dplyr::lag
with default = 0
will help to avoid NA
in result.
library(dplyr)
df %>% select(-ends_with(".1")) %>%
mutate_at(vars(A:E),
funs(l = ifelse(lag(id)==id & lag(., default=0) == "1",acumulativediff,0)))
# id date A B C D E acumulativediff A_l B_l C_l D_l E_l
# 1 1 2015-01-15 0 1 0 0 1 0 0 0 0 0 0
# 2 2 2004-03-01 1 0 1 0 1 0 0 0 0 0 0
# 3 2 2017-03-15 1 1 0 0 1 4762 4762 0 4762 0 4762
# 4 3 2000-01-15 0 0 0 1 0 0 0 0 0 0 0
# 5 4 2006-05-08 1 1 0 1 0 0 0 0 0 0 0
# 6 4 2008-05-09 1 0 1 1 0 732 732 732 0 732 0
# 7 4 2014-05-11 0 0 1 1 0 2925 2925 0 2925 2925 0
# 8 4 2014-06-11 0 0 1 0 0 2956 0 0 2956 2956 0
# 9 4 2014-07-11 1 1 1 1 1 2986 0 0 2986 0 0
# 10 4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017 3017
# 11 5 2015-12-19 1 1 0 1 1 0 0 0 0 0 0
You can also try this. A group_by
replaces parts of the used ifelse approach used in the other answer. Here a case_when
is used to check if lag() == 1
, which is enough IMO.
df %>%
select(-ends_with(".1")) %>%
group_by(id) %>%
mutate_at(.vars = vars(A:E), funs("1"=case_when(lag(.) == 1 ~ acumulativediff, TRUE ~ 0))) %>%
ungroup()
# A tibble: 11 x 13
id date A B C D E acumulativediff A_1 B_1 C_1 D_1 E_1
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2015-01-15 0 1 0 0 1 0 0 0 0 0 0
2 2 2004-03-01 1 0 1 0 1 0 0 0 0 0 0
3 2 2017-03-15 1 1 0 0 1 4762 4762 0 4762 0 4762
4 3 2000-01-15 0 0 0 1 0 0 0 0 0 0 0
5 4 2006-05-08 1 1 0 1 0 0 0 0 0 0 0
6 4 2008-05-09 1 0 1 1 0 732 732 732 0 732 0
7 4 2014-05-11 0 0 1 1 0 2925 2925 0 2925 2925 0
8 4 2014-06-11 0 0 1 0 0 2956 0 0 2956 2956 0
9 4 2014-07-11 1 1 1 1 1 2986 0 0 2986 0 0
10 4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017 3017
11 5 2015-12-19 1 1 0 1 1 0 0 0 0 0 0
This df[i,1] == df[i-1,1]
condition can be replaced with grouping by id
column. Another point is that if you have only "0" or "1" in columns A
, B
etc. then condition ( df[i,names] == "1" & df[i-1,names] == "1"
or df[i,names] == "0" & df[i-1,names] == "1"
) can be simplified to only (df[i-1,names] == "1")
which is equivalent to the lag
of df[,names]
.
I propose a data.table
solution where lag is defined by shift
function. Frankly speaking it's not an example of good coding due to use of eval(parse())
constructions but I hope it should be easier to understand the solution with them.
library(data.table)
setDT(df)
bin_names <- LETTERS[1:5]
# [1] "A" "B" "C" "D" "E"
bin_names.1 <- paste0(bin_names, ".1")
# [1] "A.1" "B.1" "C.1" "D.1" "E.1"
# slicing table in parts with "by" parameter and compute columns "A.1", "B.1" etc. in for loop
for (i in seq_along(bin_names)) df[, eval(bin_names.1[i]) := shift(as.numeric(eval(parse(text = bin_names[i]))))*acumulativediff, by = .(id)]
df[]
# id date A B C D E A.1 B.1 C.1 D.1 E.1 acumulativediff
# 1: 1 2015-01-15 0 1 0 0 1 NA NA NA NA NA 0
# 2: 2 2004-03-01 1 0 1 0 1 NA NA NA NA NA 0
# 3: 2 2017-03-15 1 1 0 0 1 4762 0 4762 0 4762 4762
# 4: 3 2000-01-15 0 0 0 1 0 NA NA NA NA NA 0
# 5: 4 2006-05-08 1 1 0 1 0 NA NA NA NA NA 0
# 6: 4 2008-05-09 1 0 1 1 0 732 732 0 732 0 732
# 7: 4 2014-05-11 0 0 1 1 0 2925 0 2925 2925 0 2925
# 8: 4 2014-06-11 0 0 1 0 0 0 0 2956 2956 0 2956
# 9: 4 2014-07-11 1 1 1 1 1 0 0 2986 0 0 2986
# 10: 4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017 3017
# 11: 5 2015-12-19 1 1 0 1 1 NA NA NA NA NA 0
If you don't like NA
s in your table you can make a little more job to fix it.
fillna <- function(x, fill = 0) {x[is.na(x)] <- fill; return(x)}
for (nm in bin_names.1) df[, eval(nm) := fillna(eval(parse(text = nm)))]
df[]
# id date A B C D E A.1 B.1 C.1 D.1 E.1 acumulativediff
# 1: 1 2015-01-15 0 1 0 0 1 0 0 0 0 0 0
# 2: 2 2004-03-01 1 0 1 0 1 0 0 0 0 0 0
# 3: 2 2017-03-15 1 1 0 0 1 4762 0 4762 0 4762 4762
# 4: 3 2000-01-15 0 0 0 1 0 0 0 0 0 0 0
# 5: 4 2006-05-08 1 1 0 1 0 0 0 0 0 0 0
# 6: 4 2008-05-09 1 0 1 1 0 732 732 0 732 0 732
# 7: 4 2014-05-11 0 0 1 1 0 2925 0 2925 2925 0 2925
# 8: 4 2014-06-11 0 0 1 0 0 0 0 2956 2956 0 2956
# 9: 4 2014-07-11 1 1 1 1 1 0 0 2986 0 0 2986
# 10: 4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017 3017
# 11: 5 2015-12-19 1 1 0 1 1 0 0 0 0 0 0
Another option is to use shift
with fill = 0
parameter to have zeros immediately.
shift(as.numeric(eval(parse(text = bin_names[i]))), fill = 0)*acumulativediff
Just noticed you actually want operations grouped by ID, in this case my answer doesn't provide the correct result.
For loops aren't always inherently slower -- iterating by row is expensive, but iterating by column should cause too much overhead, the only way to vectorize it fully would be to use matrix methods.
This should perform as well or similarly to most of the one-liners, but future-you might appreciate the readability.
setDT(df)
Suffix <- ".1"
SuffixedNames <- intersect(names(df),paste0(names(df),Suffix))
RawNames <- intersect(names(df),gsub(Suffix,"",SuffixedNames))
for (x in seq_along(RawNames)){
thisRawName <- RawNames[[x]]
thisSuffixedName <- SuffixedNames[[x]]
Raw <- df[[thisRawName]]
## Using the shift() function from the data.table package
Lagged <- shift(Raw, n = 1L, type = "lag", fill = -1L)
## Using set() from the data.table package
set(df, j = thisSuffixedName, value = ifelse((Raw == Lagged & Raw == 1L & Lagged == 1L) | (Raw == 0L & Lagged == 1L),
df[["acumulativediff"]],
0L))
}
In base R
:
df2 <- df
# first we ignore id
df2[-1,8:12] <- df[-nrow(df),3:7] * df[-1,13]
# then we make sure rows of 1st id are 0
df2[which(diff(as.numeric(df$id))==1)+1,8:12] <- 0
# id date A B C D E A.1 B.1 C.1 D.1 E.1 acumulativediff
# 1 1 2015-01-15 0 1 0 0 1 0 0 0 0 0 0
# 2 2 2004-03-01 1 0 1 0 1 0 0 0 0 0 0
# 3 2 2017-03-15 1 1 0 0 1 4762 0 4762 0 4762 4762
# 4 3 2000-01-15 0 0 0 1 0 0 0 0 0 0 0
# 5 4 2006-05-08 1 1 0 1 0 0 0 0 0 0 0
# 6 4 2008-05-09 1 0 1 1 0 732 732 0 732 0 732
# 7 4 2014-05-11 0 0 1 1 0 2925 0 2925 2925 0 2925
# 8 4 2014-06-11 0 0 1 0 0 0 0 2956 2956 0 2956
# 9 4 2014-07-11 1 1 1 1 1 0 0 2986 0 0 2986
# 10 4 2014-08-11 1 1 1 0 1 3017 3017 3017 3017 3017 3017
# 11 5 2015-12-19 1 1 0 1 1 0 0 0 0 0 0
Here's a benchmark comparing to @MKR's current solution, on given dataset and on simulated dataset of ~100 k rows. Mine's around 5 times faster on my machine in any case.
mm <- function(df){
df[-1,8:12] <- df[-nrow(df),3:7] * df[-1,13]
df[which(diff(as.numeric(df$id))==1)+1,8:12] <- 0
df}
mkr <- function(df){df %>% select(-ends_with(".1")) %>%
mutate_at(vars(A:E),
funs(l = ifelse(lag(id)==id & lag(., default=0) == "1",acumulativediff,0)))}
microbenchmark::microbenchmark(mm(df),mkr(df),unit="relative")
# Unit: relative
# expr min lq mean median uq max neval
# mm(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
# mkr(df) 7.788748 7.666287 5.265091 6.755467 6.655934 1.291942 100
big <- do.call(rbind,replicate(10000,df,F))
big$id <- data.table::rleid(big$id)
microbenchmark::microbenchmark(mm(big),mkr(big),unit="relative")
# Unit: relative
# expr min lq mean median uq max neval
# mm(big) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
# mkr(big) 7.065627 4.945323 4.429752 4.910065 4.566391 1.765609 100
Does this work for you?
df <-data.frame(id=c("1","2","2","3","4","4", "4", "4", "4", "4", "5"),
date = c("2015-01-15", "2004-03-01", "2017-03-15", "2000-01-15", "2006-05-08",
"2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11", "2015-12-19"),
A =c (0,1,1,0,1,1,0,0,1,1,1), B=c(1,0,1,0,1,0,0,0,1,1,1), C = c(0,1,0,0,0,1,1,1,1,1,0),
D = c(0,0,0,1,1,1,1,0,1,0,1), E = c(1,1,1,0,0,0,0,0,1,1,1), A.1 = c(0,0,0,0,0,0,0,0,0,0,0),
B.1 = c(0,0,0,0,0,0,0,0,0,0,0), C.1 = c(0,0,0,0,0,0,0,0,0,0,0), D.1 = c(0,0,0,0,0,0,0,0,0,0,0),
E.1 = c(0,0,0,0,0,0,0,0,0,0,0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0),
stringsAsFactors = FALSE)
df2 <- df
0), D.1 = c(0, 0, 0, 0, 0, 732, 2925, 2956, 0, 3017, 0),E.1 = c(0, 0, 4762, 0, 0, 0, 0, 0, 0, 3017, 0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0)), .Names = c("id","date", "A", "B", "C", "D", "E", "A.1", "B.1", "C.1", "D.1", "E.1", "acumulativediff"), row.names = c(NA,-11L), class = "data.frame")
names <- colnames(df[3:7])
names2 <- colnames(df[8:12])
diff <- which(colnames(df)=="acumulativediff")
df2[,names2] <- ifelse(df[,1] == dplyr::lag(df[,1]) & df[,names] == "1" &
dplyr::lag(df[,names]) == "1",
df[,diff],
ifelse (df[,1] == dplyr::lag(df[,1]) & df[,names] == "0" &
dplyr::lag(df[,names]) == "1", df[,diff], 0))
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.