简体   繁体   中英

Fill blank rows depends on previous/next non-empty value

I have a subscription data frame looks like the following. There are about 1 million unique IDs. The table lists subscription status. When user began subscription service, status field is indicated by 'Sub' and when user unsubscribed, it's indicated by 'Usub'.

dat <- data.frame(ID=c(rep("A",12),(rep("B",12))), Year="2014", Month=rep(seq(1:12),2), Status=NA)
dat$Status[4] <- "Sub"
dat$Status[8] <- "Usub"
dat$Status[17] <- "Usub"
dat$Status[21] <- "Sub"

ID Year Month Status
A  2014   1    
A  2014   2    
A  2014   3    
A  2014   4    Sub
A  2014   5    
A  2014   6    
A  2014   7    
A  2014   8    Usub 
A  2014   9    
A  2014  10    
A  2014  11    
A  2014  12    
B  2014   1    
B  2014   2    
B  2014   3    
B  2014   4    
B  2014   5    Usub    
B  2014   6    
B  2014   7    
B  2014   8    
B  2014   9    Sub
B  2014  10    
B  2014  11    
B  2014  12    
C  2014   1     .
.    .    .     .
.    .    .     .

I am looking to fill in the gap between each status updates. The desired output table would look like the following:

ID Year Month Status
A  2014   1    Usub
A  2014   2    Usub
A  2014   3    Usub
A  2014   4    Sub
A  2014   5    Sub
A  2014   6    Sub
A  2014   7    Sub
A  2014   8    Usub
A  2014   9    Usub
A  2014  10    Usub
A  2014  11    Usub
A  2014  12    Usub
B  2014   1    Sub
B  2014   2    Sub
B  2014   3    Sub
B  2014   4    Sub
B  2014   5    Usub
B  2014   6    Usub
B  2014   7    Usub
B  2014   8    Usub
B  2014   9    Sub
B  2014  10    Sub
B  2014  11    Sub
B  2014  12    Sub
C  2014   1     .
.    .    .     .
.    .    .     .

Each ID has at least one status value. If an ID's first status record is "Usub", then all prior months' status is "Sub". (Like ID B on 2014/05) On the contrary, if the first status record start with "Sub", all prior months' status is "Usub"

You can generate an alternating sequence which is equivalent to your desired Status column by subtracting Status == "Usub" and Status = "Sub" in which way all positions that should be filled with Sub will have a lower value than those that should be filled with Usub and then use the fact that factor can be labelled in an ordered way to transform the integer sequence to a factor:

library(dplyr)
df %>% group_by(ID) %>% mutate(Status = factor(cumsum((Status == "Usub") - (Status == "Sub")), 
                                               labels = c("Sub", "Usub")))

#    ID Year Month Status
# 1   A 2014     1   Usub
# 2   A 2014     2   Usub
# 3   A 2014     3   Usub
# 4   A 2014     4    Sub
# 5   A 2014     5    Sub
# 6   A 2014     6    Sub
# 7   A 2014     7    Sub
# 8   A 2014     8   Usub
# 9   A 2014     9   Usub
# 10  A 2014    10   Usub
# 11  A 2014    11   Usub
# 12  A 2014    12   Usub
# 13  B 2014     1    Sub
# 14  B 2014     2    Sub
# 15  B 2014     3    Sub
# 16  B 2014     4    Sub
# 17  B 2014     5   Usub
# 18  B 2014     6   Usub
# 19  B 2014     7   Usub
# 20  B 2014     8   Usub
# 21  B 2014     9    Sub
# 22  B 2014    10    Sub
# 23  B 2014    11    Sub
# 24  B 2014    12    Sub

A corresponding data.table way would be:

library(data.table)
setDT(df)[, Status := as.character(factor(cumsum((Status == "Usub") - (Status == "Sub")), labels = c("Sub", "Usub"))), .(ID)]

where you have to convert the new factor back to character class since it doesn't allow the type change when creating a new column in place.

The data assumes you have empty string instead of NA :

structure(list(ID = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B"), Year = c("2014", "2014", "2014", "2014", "2014", 
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014", 
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014", 
"2014", "2014", "2014"), Month = c("1", "2", "3", "4", "5", "6", 
"7", "8", "9", "10", "11", "12", "1", "2", "3", "4", "5", "6", 
"7", "8", "9", "10", "11", "12"), Status = c("", "", "", "Sub", 
"", "", "", "Usub", "", "", "", "", "", "", "", "", "Usub", "", 
"", "", "Sub", "", "", "")), .Names = c("ID", "Year", "Month", 
"Status"), row.names = c(NA, 24L), class = "data.frame")
uniquevector<-unique(dat$ID)
for(i in uniquevector){
  zzz <- which(dat$ID==i & dat$Status == "Sub")
  zzz2 <- which(dat$ID==i & dat$Status == "Usub")
  zzz3 <- which(dat$ID==i & dat$Month == 12)
  zzz4 <- which(dat$ID==i & dat$Month == 1)
  if(zzz2 > zzz){  
  index<-zzz:(zzz2-1)
  dat$Status[index] <- "Sub"
}
  if(zzz2 < zzz){  
    index<-zzz2:(zzz-1)
    dat$Status[index] <- "Usub"
  }
  if(zzz3 > zzz2 & zzz < zzz2){  
    index<-zzz2:zzz3
    dat$Status[index] <- "Usub"
  }
if(zzz2 < zzz & zzz3 > zzz){  
  index<-zzz:zzz3
  dat$Status[index] <- "Sub"
if((zzz4 < zzz) & zzz < zzz2){  
    index<-zzz4:(zzz-1)
    dat$Status[index] <- "Usub"
}
  if((zzz4 < zzz2) & zzz2 < zzz){  
    index<-zzz4:(zzz2-1)
    dat$Status[index] <- "Sub"
  } 

  }}

Another option would be to convert the blank "" to NA and make use of na.locf from zoo package to replace the NA with the non-NA previous element. As this is a group by operation, we can also do this with ave from base R .

library(zoo)
df$Status <- with(df, ave(replace(Status, !nzchar(Status), NA), ID,
           FUN = function(x){ x1 <- na.locf(x, na.rm=FALSE)
         replace(x1, is.na(x1), setdiff(unique(na.omit(x1)), x1[!is.na(x1)][1]))}))
df$Status
#[1] "Usub" "Usub" "Usub" "Sub"  "Sub"  "Sub"  "Sub"  "Usub" "Usub" "Usub" "Usub" "Usub" "Sub"  "Sub"  "Sub"  "Sub"  "Usub" "Usub" "Usub"
#[20] "Usub" "Sub"  "Sub"  "Sub"  "Sub" 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM