Add new columns and insert values in columns based on value in another column

Question

I have a R dataframe data1 as below:

prodID   storeID   Term    Exit
1        1001      5       0
1        1002      4       1
1        1003      3       1
1        1004      5       0
2        1001      4       1
2        1002      3       1
2        1003      5       0
3        1001      4       1
3        1002      3       1
3        1003      5       0
4        1001      4       1
4        1002      3       1
5        1001      5       0
5        1002      4       1
5        1003      3       1

This is of course highly simplified format of my real data which runs into around 3 million rows. I have to do the following:

Based on max value in Term column, insert that many columns in data1 with NA values. Column names should be Week1 , Week2 , Week3 , etc
For each row, fill the new columns with 0, 1 or NA using these rules: 1) If Term is 5, insert 0 in Week1 through Week4 and 1 in Week5. 2) If Term is 4, insert 0 in Week1, Week2 and Week3, 1 in Week4, and keep NA in Week5. And so on.

The final output should look like:

prodID   storeID   Term    Exit  Week1   Week2   Week3   Week4   Week5
1        1001      5       0     0       0       0       0       1
1        1002      4       1     0       0       0       1       NA
1        1003      3       1     0       0       1       NA      NA
1        1004      5       0     0       0       0       0       1
2        1001      4       1     0       0       0       1       NA
2        1002      3       1     0       0       1       NA      NA
2        1003      5       0     0       0       0       0       1
3        1001      4       1     0       0       0       1       NA
3        1002      3       1     0       0       1       NA      NA
3        1003      5       0     0       0       0       0       1
4        1001      4       1     0       0       0       1       NA
4        1002      3       1     0       0       1       NA      NA
5        1001      5       0     0       0       0       0       1
5        1002      4       1     0       0       0       1       NA
5        1003      3       1     0       0       1       NA      NA

This is what I tried:

# Names of the week columns to add (hard-coded to five here rather than
# derived from the maximum of Term).
variant <- c("Week1","Week2","Week3","Week4","Week5")

# Create the new columns, initially filled with NA.
data1[variant] <- NA

# NOTE(review): the loop index `i` is never used in the body. Each ifelse()
# call already operates on the whole Term column, so every iteration
# recomputes the same five full columns.
for (i in 1:length(data1$prodID)){
  # WeekK is set to 1 where Term == K and to 0 in every other row. This
  # overwrites the NA placeholders created above, so (for non-missing Term)
  # no cell is left as NA -- weeks after a row's Term end up 0 instead.
  data1$Week1 <- ifelse(data1$Term==1,1,0)
  data1$Week2 <- ifelse(data1$Term==2,1,0)
  data1$Week3 <- ifelse(data1$Term==3,1,0)
  data1$Week4 <- ifelse(data1$Term==4,1,0)
  data1$Week5 <- ifelse(data1$Term==5,1,0)
}

This doesn't help me populate NA in the required cells. I would like to retain the NA values because I am going to do a wide to long data transformation on the data frame later on. And I know the above approach is not feasible in my huge dataset. Any suggestions are most welcome.

Answer 1

Here is one idea. We can create the content you need and then split the columns.

library(dplyr)
library(data.table)
library(splitstackshape)

# Step 1 (inner call): encode each row's week flags as one comma-separated
# string selected by Term -- "0" for weeks before the final one, "1" for the
# final week, "NA" for the weeks after it. The Term conditions are mutually
# exclusive, so their order does not matter.
# Step 2 (outer call): cSplit() breaks that string column apart into the
# separate Week_1 ... Week_5 columns.
dat2 <- cSplit(
  mutate(
    dat,
    Week = case_when(
      Term == 1 ~ "1,NA,NA,NA,NA",
      Term == 2 ~ "0,1,NA,NA,NA",
      Term == 3 ~ "0,0,1,NA,NA",
      Term == 4 ~ "0,0,0,1,NA",
      Term == 5 ~ "0,0,0,0,1"
    )
  ),
  splitCols = "Week"
)
dat2
#     prodID storeID Term Exit Week_1 Week_2 Week_3 Week_4 Week_5
#  1:      1    1001    5    0      0      0      0      0      1
#  2:      1    1002    4    1      0      0      0      1     NA
#  3:      1    1003    3    1      0      0      1     NA     NA
#  4:      1    1004    5    0      0      0      0      0      1
#  5:      2    1001    4    1      0      0      0      1     NA
#  6:      2    1002    3    1      0      0      1     NA     NA
#  7:      2    1003    5    0      0      0      0      0      1
#  8:      3    1001    4    1      0      0      0      1     NA
#  9:      3    1002    3    1      0      0      1     NA     NA
# 10:      3    1003    5    0      0      0      0      0      1
# 11:      4    1001    4    1      0      0      0      1     NA
# 12:      4    1002    3    1      0      0      1     NA     NA
# 13:      5    1001    5    0      0      0      0      0      1
# 14:      5    1002    4    1      0      0      0      1     NA
# 15:      5    1003    3    1      0      0      1     NA     NA

Or use this tidyverse method. I like this one better than my previous one because this method does not require manually typing the week values.

library(dplyr)
library(tidyr)
library(purrr)

# Expand every row into one row per week (1..Term), flag the final week,
# then reshape back to one row per product/store with a column per week.
dat2 <- dat %>%
  as_tibble() %>%
  # List-column holding the week numbers 1..Term for each row.
  mutate(Week = map2(1, Term, `:`)) %>%
  # Name the column to unnest explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(Week) %>%
  # Week runs from 1 to Term within each original row, so the final week is
  # simply the one where Week == Term -- no grouping or max() is needed.
  mutate(Week_Value = as.integer(Week == Term),
         Week = paste0("Week", Week)) %>%
  # pivot_wider() keeps the new columns in order of first appearance
  # (Week1, Week2, ..., Week10, ...), whereas spread() sorts the names
  # alphabetically. Weeks beyond a row's Term have no value and become NA.
  pivot_wider(names_from = Week, values_from = Week_Value)
dat2
# # A tibble: 15 x 9
#    prodID storeID  Term  Exit Week1 Week2 Week3 Week4 Week5
#     <int>   <int> <int> <int> <int> <int> <int> <int> <int>
#  1      1    1001     5     0     0     0     0     0     1
#  2      1    1002     4     1     0     0     0     1    NA
#  3      1    1003     3     1     0     0     1    NA    NA
#  4      1    1004     5     0     0     0     0     0     1
#  5      2    1001     4     1     0     0     0     1    NA
#  6      2    1002     3     1     0     0     1    NA    NA
#  7      2    1003     5     0     0     0     0     0     1
#  8      3    1001     4     1     0     0     0     1    NA
#  9      3    1002     3     1     0     0     1    NA    NA
# 10      3    1003     5     0     0     0     0     0     1
# 11      4    1001     4     1     0     0     0     1    NA
# 12      4    1002     3     1     0     0     1    NA    NA
# 13      5    1001     5     0     0     0     0     0     1
# 14      5    1002     4     1     0     0     0     1    NA
# 15      5    1003     3     1     0     0     1    NA    NA

UPDATE

We can use str_pad from the stringr package to zero-pad the week number before spreading the Week column, so that the resulting column names sort in the correct order.

library(tidyverse)

# Same reshaping as before, but with zero-padded week numbers so that the
# alphabetical column ordering produced by spread() matches week order.
dat2 <- dat %>%
  as_tibble() %>%
  # List-column holding the week numbers 1..Term for each row.
  mutate(Week = map2(1, Term, `:`)) %>%
  # Name the column to unnest explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(Week) %>%
  # Week runs from 1 to Term within each original row, so the final week is
  # the one where Week == Term -- no grouping or max() is needed.
  mutate(Week_Value = as.integer(Week == Term),
         # Pad to three digits (Week001, Week002, ...) so names sort correctly.
         Week = paste0("Week", str_pad(Week, width = 3, pad = "0"))) %>%
  spread(Week, Week_Value)
dat2
# # A tibble: 15 x 9
#   prodID storeID  Term  Exit Week001 Week002 Week003 Week004 Week005
#     <int>   <int> <int> <int>   <int>   <int>   <int>   <int>   <int>
#  1      1    1001     5     0       0       0       0       0       1
#  2      1    1002     4     1       0       0       0       1      NA
#  3      1    1003     3     1       0       0       1      NA      NA
#  4      1    1004     5     0       0       0       0       0       1
#  5      2    1001     4     1       0       0       0       1      NA
#  6      2    1002     3     1       0       0       1      NA      NA
#  7      2    1003     5     0       0       0       0       0       1
#  8      3    1001     4     1       0       0       0       1      NA
#  9      3    1002     3     1       0       0       1      NA      NA
# 10      3    1003     5     0       0       0       0       0       1
# 11      4    1001     4     1       0       0       0       1      NA
# 12      4    1002     3     1       0       0       1      NA      NA
# 13      5    1001     5     0       0       0       0       0       1
# 14      5    1002     4     1       0       0       0       1      NA
# 15      5    1003     3     1       0       0       1      NA      NA

DATA

# Example data from the question: 15 rows with the columns prodID, storeID,
# Term and Exit. read.table() parses the whitespace-separated text and, with
# header = TRUE, takes the column names from its first line.
dat <- read.table(text = "prodID   storeID   Term    Exit
1        1001      5       0
                  1        1002      4       1
                  1        1003      3       1
                  1        1004      5       0
                  2        1001      4       1
                  2        1002      3       1
                  2        1003      5       0
                  3        1001      4       1
                  3        1002      3       1
                  3        1003      5       0
                  4        1001      4       1
                  4        1002      3       1
                  5        1001      5       0
                  5        1002      4       1
                  5        1003      3       1",
                  header = TRUE)

Answer 2

Here is one option with base R: we loop through 'Term', use tabulate to get 0s followed by a single 1 for each element, pad with NA at the end using length<-, and rbind the list elements to create the columns of interest.

# Number of week columns to create: the longest Term in the data. Computed
# once so the column names and the matrix width always agree, and so max()
# is not re-evaluated for every row.
n_weeks <- max(dat$Term)

# For each Term value x, tabulate(x) yields x - 1 zeros followed by a 1;
# `length<-` then pads the vector with NA up to n_weeks. rbind stacks the
# per-row vectors into a matrix whose columns become Week1 ... Week<n_weeks>.
dat[paste0("Week", seq_len(n_weeks))] <- do.call(rbind, lapply(dat$Term,
                  function(x) `length<-`(tabulate(x), n_weeks)))
dat
#   prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
#1       1    1001    5    0     0     0     0     0     1
#2       1    1002    4    1     0     0     0     1    NA
#3       1    1003    3    1     0     0     1    NA    NA
#4       1    1004    5    0     0     0     0     0     1
#5       2    1001    4    1     0     0     0     1    NA
#6       2    1002    3    1     0     0     1    NA    NA
#7       2    1003    5    0     0     0     0     0     1
#8       3    1001    4    1     0     0     0     1    NA
#9       3    1002    3    1     0     0     1    NA    NA
#10      3    1003    5    0     0     0     0     0     1
#11      4    1001    4    1     0     0     0     1    NA
#12      4    1002    3    1     0     0     1    NA    NA
#13      5    1001    5    0     0     0     0     0     1
#14      5    1002    4    1     0     0     0     1    NA
#15      5    1003    3    1     0     0     1    NA    NA

Or using the similar approach with tidyverse

library(tidyverse)
# For every row build a one-row tibble of week flags from Term, store it in
# a list-column, then unnest it into regular Week1 ... WeekN columns.
dat %>%
  mutate(Week = map(Term, ~ {
    # tabulate(.x) yields .x - 1 zeros followed by a single 1.
    flags <- tabulate(.x)
    # Name the flags Week1 ... Week<.x> and turn them into a one-row tibble.
    as_tibble(set_names(as.list(flags), paste0("Week", seq_along(flags))))
  })) %>%
  # Name the column to unnest explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0. Rows with fewer week columns are filled with NA.
  unnest(Week)
#   prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
#1       1    1001    5    0     0     0     0     0     1
#2       1    1002    4    1     0     0     0     1    NA
#3       1    1003    3    1     0     0     1    NA    NA
#4       1    1004    5    0     0     0     0     0     1
#5       2    1001    4    1     0     0     0     1    NA
#6       2    1002    3    1     0     0     1    NA    NA
#7       2    1003    5    0     0     0     0     0     1
#8       3    1001    4    1     0     0     0     1    NA
#9       3    1002    3    1     0     0     1    NA    NA
#10      3    1003    5    0     0     0     0     0     1
#11      4    1001    4    1     0     0     0     1    NA
#12      4    1002    3    1     0     0     1    NA    NA
#13      5    1001    5    0     0     0     0     0     1
#14      5    1002    4    1     0     0     0     1    NA
#15      5    1003    3    1     0     0     1    NA    NA

Answer 3

An option using dplyr::mutate_at and case_when can be based on finding subscript integer in column name using quo_name(quo(.)) and then checking if column number is more/equal/less than value of Term .

# First add one NA-filled column per week, up to the maximum value of Term
df[, paste0("Week", seq_len(max(df$Term)))] <- NA

library(dplyr)

# Fill each Week column by comparing its week number with Term:
# 0 before the final week, 1 in the final week, NA afterwards.
# Inside funs(), `.` stands for the column being processed, so
# quo_name(quo(.)) gives that column's name (e.g. "Week3").
# The leading non-digits are stripped to get the week number. The previous
# pattern ".*(\\d+)" was greedy and kept only the last digit, so "Week10"
# was read as 0.
# NOTE(review): funs() and mutate_at() are deprecated/superseded in newer
# dplyr releases; across() with cur_column() is the documented replacement --
# confirm against the installed dplyr version.
df %>% mutate_at(vars(starts_with("Week")), funs(case_when(
  as.integer(sub("^\\D+", "", quo_name(quo(.)))) < Term  ~ 0L,
  as.integer(sub("^\\D+", "", quo_name(quo(.)))) == Term ~ 1L,
  TRUE                                                   ~ NA_integer_
)))

#    prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
# 1       1    1001    5    0     0     0     0     0     1
# 2       1    1002    4    1     0     0     0     1    NA
# 3       1    1003    3    1     0     0     1    NA    NA
# 4       1    1004    5    0     0     0     0     0     1
# 5       2    1001    4    1     0     0     0     1    NA
# 6       2    1002    3    1     0     0     1    NA    NA
# 7       2    1003    5    0     0     0     0     0     1
# 8       3    1001    4    1     0     0     0     1    NA
# 9       3    1002    3    1     0     0     1    NA    NA
# 10      3    1003    5    0     0     0     0     0     1
# 11      4    1001    4    1     0     0     0     1    NA
# 12      4    1002    3    1     0     0     1    NA    NA
# 13      5    1001    5    0     0     0     0     0     1
# 14      5    1002    4    1     0     0     0     1    NA
# 15      5    1003    3    1     0     0     1    NA    NA

Data:

# Example data from the question: 15 rows with the columns prodID, storeID,
# Term and Exit. read.table() parses the whitespace-separated text; with
# header = TRUE the column names come from the header line, and
# stringsAsFactors = FALSE stops character columns being turned into factors.
df <- read.table(text="
prodID   storeID   Term    Exit
1        1001      5       0
1        1002      4       1
1        1003      3       1
1        1004      5       0
2        1001      4       1
2        1002      3       1
2        1003      5       0
3        1001      4       1
3        1002      3       1
3        1003      5       0
4        1001      4       1
4        1002      3       1
5        1001      5       0
5        1002      4       1
5        1003      3       1",
header = TRUE, stringsAsFactors = FALSE)

Add new columns and insert values in columns based on value in another column

Question

3 answers

solution1
3 ACCEPTED 2018-06-25 18:09:07

solution2
2 2018-06-25 18:27:35

solution3
2 2018-06-25 18:31:20

Add new columns and insert values in columns based on value in another column

Question

3 answers

solution1 3 ACCEPTED 2018-06-25 18:09:07

solution2 2 2018-06-25 18:27:35

solution3 2 2018-06-25 18:31:20

solution1
3 ACCEPTED 2018-06-25 18:09:07

solution2
2 2018-06-25 18:27:35

solution3
2 2018-06-25 18:31:20