简体   繁体   中英

Add new columns and insert values in columns based on value in another column

I have an R data frame data1 as below:

prodID   storeID   Term    Exit
1        1001      5       0
1        1002      4       1
1        1003      3       1
1        1004      5       0
2        1001      4       1
2        1002      3       1
2        1003      5       0
3        1001      4       1
3        1002      3       1
3        1003      5       0
4        1001      4       1
4        1002      3       1
5        1001      5       0
5        1002      4       1
5        1003      3       1

This is, of course, a highly simplified version of my real data, which runs to around 3 million rows. I have to do the following:

  1. Based on the maximum value in the Term column, insert that many columns into data1, filled with NA. The column names should be Week1, Week2, Week3, etc.
  2. For each row, fill the new columns with 0, 1 or NA using these rules: 1) If Term is 5, insert 0 in Week1 through Week4 and 1 in Week5. 2) If Term is 4, insert 0 in Week1, Week2 and Week3, 1 in Week4, and keep NA in Week5. And so on.

The final output should look like:

prodID   storeID   Term    Exit  Week1   Week2   Week3   Week4   Week5
1        1001      5       0     0       0       0       0       1
1        1002      4       1     0       0       0       1       NA
1        1003      3       1     0       0       1       NA      NA
1        1004      5       0     0       0       0       0       1
2        1001      4       1     0       0       0       1       NA
2        1002      3       1     0       0       1       NA      NA
2        1003      5       0     0       0       0       0       1
3        1001      4       1     0       0       0       1       NA
3        1002      3       1     0       0       1       NA      NA
3        1003      5       0     0       0       0       0       1
4        1001      4       1     0       0       0       1       NA
4        1002      3       1     0       0       1       NA      NA
5        1001      5       0     0       0       0       0       1
5        1002      4       1     0       0       0       1       NA
5        1003      3       1     0       0       1       NA      NA

This is what I tried:

# Names of the indicator columns to add (hard-coded for a maximum Term of 5).
variant <- c("Week1","Week2","Week3","Week4","Week5")

# Create all five columns, initially filled with NA.
data1[variant] <- NA

# NOTE(review): the loop index i is never used in the body, so every pass
# repeats the same whole-column assignments.
for (i in 1:length(data1$prodID)){
  # Each WeekN becomes 1 where Term equals N and 0 otherwise. There is no NA
  # branch, so the weeks after Term end up as 0 rather than NA.
  data1$Week1 <- ifelse(data1$Term==1,1,0)
  data1$Week2 <- ifelse(data1$Term==2,1,0)
  data1$Week3 <- ifelse(data1$Term==3,1,0)
  data1$Week4 <- ifelse(data1$Term==4,1,0)
  data1$Week5 <- ifelse(data1$Term==5,1,0)
}

This doesn't help me populate NA in the required cells. I would like to retain the NA values because I am going to do a wide to long data transformation on the data frame later on. And I know the above approach is not feasible in my huge dataset. Any suggestions are most welcome.

Here is one idea. We can create the content you need and then split the columns.

library(dplyr)
library(data.table)
library(splitstackshape)

# Encode each Term as a comma-separated string of week flags (0 before the
# final week, 1 in the final week, "NA" for the weeks after it), then have
# cSplit() break that string into one column per week.
dat2 <- cSplit(
  mutate(
    dat,
    Week = case_when(
      Term == 1 ~ "1,NA,NA,NA,NA",
      Term == 2 ~ "0,1,NA,NA,NA",
      Term == 3 ~ "0,0,1,NA,NA",
      Term == 4 ~ "0,0,0,1,NA",
      Term == 5 ~ "0,0,0,0,1"
    )
  ),
  splitCols = "Week"
)
dat2
#     prodID storeID Term Exit Week_1 Week_2 Week_3 Week_4 Week_5
#  1:      1    1001    5    0      0      0      0      0      1
#  2:      1    1002    4    1      0      0      0      1     NA
#  3:      1    1003    3    1      0      0      1     NA     NA
#  4:      1    1004    5    0      0      0      0      0      1
#  5:      2    1001    4    1      0      0      0      1     NA
#  6:      2    1002    3    1      0      0      1     NA     NA
#  7:      2    1003    5    0      0      0      0      0      1
#  8:      3    1001    4    1      0      0      0      1     NA
#  9:      3    1002    3    1      0      0      1     NA     NA
# 10:      3    1003    5    0      0      0      0      0      1
# 11:      4    1001    4    1      0      0      0      1     NA
# 12:      4    1002    3    1      0      0      1     NA     NA
# 13:      5    1001    5    0      0      0      0      0      1
# 14:      5    1002    4    1      0      0      0      1     NA
# 15:      5    1003    3    1      0      0      1     NA     NA

Or use this tidyverse method. I like this one better than my previous one because this method does not require manually typing the week values.

library(dplyr)
library(tidyr)
library(purrr)

dat2 <- dat %>%
  # Work on a tibble so the result prints as one.
  as_tibble() %>%
  # One list element per row holding the week numbers 1..Term
  # (assumes Term >= 1).
  mutate(Week = map2(1, Term, `:`)) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated in
  # tidyr >= 1.0.0 and emits a warning.
  unnest(Week) %>%
  # The final week of a row is the one equal to its Term, so no grouping is
  # needed to find it.
  mutate(Week_Value = as.integer(Week == Term),
         Week = paste0("Week", Week)) %>%
  # Weeks beyond a row's Term have no long-format row, so spread() leaves
  # them as NA.
  spread(Week, Week_Value)
dat2
# # A tibble: 15 x 9
#    prodID storeID  Term  Exit Week1 Week2 Week3 Week4 Week5
#     <int>   <int> <int> <int> <int> <int> <int> <int> <int>
#  1      1    1001     5     0     0     0     0     0     1
#  2      1    1002     4     1     0     0     0     1    NA
#  3      1    1003     3     1     0     0     1    NA    NA
#  4      1    1004     5     0     0     0     0     0     1
#  5      2    1001     4     1     0     0     0     1    NA
#  6      2    1002     3     1     0     0     1    NA    NA
#  7      2    1003     5     0     0     0     0     0     1
#  8      3    1001     4     1     0     0     0     1    NA
#  9      3    1002     3     1     0     0     1    NA    NA
# 10      3    1003     5     0     0     0     0     0     1
# 11      4    1001     4     1     0     0     0     1    NA
# 12      4    1002     3     1     0     0     1    NA    NA
# 13      5    1001     5     0     0     0     0     0     1
# 14      5    1002     4     1     0     0     0     1    NA
# 15      5    1003     3     1     0     0     1    NA    NA

UPDATE

We can use str_pad from the stringr package to zero-pad the week number before reshaping the Week column to wide format, so that the resulting column names sort in the correct order.

library(tidyverse)

dat2 <- dat %>%
  # Work on a tibble so the result prints as one.
  as_tibble() %>%
  # One list element per row holding the week numbers 1..Term
  # (assumes Term >= 1).
  mutate(Week = map2(1, Term, `:`)) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated in
  # tidyr >= 1.0.0 and emits a warning.
  unnest(Week) %>%
  # The final week of a row is the one equal to its Term, so no grouping is
  # needed to find it. Zero-padding the week number to three digits makes
  # the column names sort correctly as text (Week002 before Week010).
  mutate(Week_Value = as.integer(Week == Term),
         Week = paste0("Week", str_pad(Week, width = 3, pad = "0"))) %>%
  # Weeks beyond a row's Term have no long-format row, so spread() leaves
  # them as NA.
  spread(Week, Week_Value)
dat2
# # A tibble: 15 x 9
#   prodID storeID  Term  Exit Week001 Week002 Week003 Week004 Week005
#     <int>   <int> <int> <int>   <int>   <int>   <int>   <int>   <int>
#  1      1    1001     5     0       0       0       0       0       1
#  2      1    1002     4     1       0       0       0       1      NA
#  3      1    1003     3     1       0       0       1      NA      NA
#  4      1    1004     5     0       0       0       0       0       1
#  5      2    1001     4     1       0       0       0       1      NA
#  6      2    1002     3     1       0       0       1      NA      NA
#  7      2    1003     5     0       0       0       0       0       1
#  8      3    1001     4     1       0       0       0       1      NA
#  9      3    1002     3     1       0       0       1      NA      NA
# 10      3    1003     5     0       0       0       0       0       1
# 11      4    1001     4     1       0       0       0       1      NA
# 12      4    1002     3     1       0       0       1      NA      NA
# 13      5    1001     5     0       0       0       0       0       1
# 14      5    1002     4     1       0       0       0       1      NA
# 15      5    1003     3     1       0       0       1      NA      NA

DATA

# Sample data from the question, parsed from inline text. The first line of
# the text supplies the column names (header = TRUE).
dat <- read.table(text = "prodID   storeID   Term    Exit
1        1001      5       0
                  1        1002      4       1
                  1        1003      3       1
                  1        1004      5       0
                  2        1001      4       1
                  2        1002      3       1
                  2        1003      5       0
                  3        1001      4       1
                  3        1002      3       1
                  3        1003      5       0
                  4        1001      4       1
                  4        1002      3       1
                  5        1001      5       0
                  5        1002      4       1
                  5        1003      3       1",
                  header = TRUE)

Here is one option with base R: we loop through 'Term', use tabulate to get 0s and a single 1 for each element, pad with NA at the end using `length<-`, and rbind the list elements to create the columns of interest.

# Compute the longest Term once: it sets both the number of Week columns and
# the padded length of every row, and hoisting it avoids re-scanning the
# whole column for each row.
max_term <- max(dat$Term)

# tabulate(x) gives 0s with a single 1 in position x; `length<-` then pads the
# vector with NA up to max_term. rbind stacks the per-row vectors into a
# matrix whose columns become Week1 ... Week<max_term>.
dat[paste0("Week", seq_len(max_term))] <- do.call(rbind, lapply(dat$Term,
                  function(x) `length<-`(tabulate(x), max_term)))
dat
#   prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
#1       1    1001    5    0     0     0     0     0     1
#2       1    1002    4    1     0     0     0     1    NA
#3       1    1003    3    1     0     0     1    NA    NA
#4       1    1004    5    0     0     0     0     0     1
#5       2    1001    4    1     0     0     0     1    NA
#6       2    1002    3    1     0     0     1    NA    NA
#7       2    1003    5    0     0     0     0     0     1
#8       3    1001    4    1     0     0     0     1    NA
#9       3    1002    3    1     0     0     1    NA    NA
#10      3    1003    5    0     0     0     0     0     1
#11      4    1001    4    1     0     0     0     1    NA
#12      4    1002    3    1     0     0     1    NA    NA
#13      5    1001    5    0     0     0     0     0     1
#14      5    1002    4    1     0     0     0     1    NA
#15      5    1003    3    1     0     0     1    NA    NA

Or use a similar approach with the tidyverse:

library(tidyverse)
dat %>%
  mutate(Week = map(Term, function(term) {
    # tabulate(term) is all 0s with a single 1 in position `term`.
    flags <- tabulate(term)
    # One-row tibble whose columns are Week1 ... Week<term>.
    as_tibble(set_names(as.list(flags), paste0("Week", seq_along(flags))))
  })) %>%
  # Name the list-column explicitly (a bare unnest is deprecated in
  # tidyr >= 1.0.0). Rows with fewer week columns are filled with NA when
  # the one-row tibbles are combined.
  unnest(Week)
#   prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
#1       1    1001    5    0     0     0     0     0     1
#2       1    1002    4    1     0     0     0     1    NA
#3       1    1003    3    1     0     0     1    NA    NA
#4       1    1004    5    0     0     0     0     0     1
#5       2    1001    4    1     0     0     0     1    NA
#6       2    1002    3    1     0     0     1    NA    NA
#7       2    1003    5    0     0     0     0     0     1
#8       3    1001    4    1     0     0     0     1    NA
#9       3    1002    3    1     0     0     1    NA    NA
#10      3    1003    5    0     0     0     0     0     1
#11      4    1001    4    1     0     0     0     1    NA
#12      4    1002    3    1     0     0     1    NA    NA
#13      5    1001    5    0     0     0     0     0     1
#14      5    1002    4    1     0     0     0     1    NA
#15      5    1003    3    1     0     0     1    NA    NA

An option using dplyr::mutate_at and case_when: extract the week number from each column's name via quo_name(quo(.)), then check whether that number is less than, equal to, or greater than the value of Term.

# First add one placeholder column per week, up to the maximum value of Term
df[, paste0("Week", seq_len(max(df$Term)))] <- NA

library(dplyr)

# NOTE: funs() is deprecated since dplyr 0.8.0. The quo_name(quo(.)) trick
# relies on funs() substituting the column symbol for `.`; on current dplyr
# the equivalent is across() with cur_column().
df %>% mutate_at(vars(starts_with("Week")), funs(case_when(
  # Strip the leading non-digits to recover the whole week number. A greedy
  # ".*(\\d+)" would keep only the last digit, reading "Week10" as 0.
  as.integer(sub("^\\D+", "", quo_name(quo(.)))) < Term ~ 0L,
  as.integer(sub("^\\D+", "", quo_name(quo(.)))) == Term ~ 1L,
  # Weeks after Term stay missing.
  TRUE                                                   ~ NA_integer_
)))

#    prodID storeID Term Exit Week1 Week2 Week3 Week4 Week5
# 1       1    1001    5    0     0     0     0     0     1
# 2       1    1002    4    1     0     0     0     1    NA
# 3       1    1003    3    1     0     0     1    NA    NA
# 4       1    1004    5    0     0     0     0     0     1
# 5       2    1001    4    1     0     0     0     1    NA
# 6       2    1002    3    1     0     0     1    NA    NA
# 7       2    1003    5    0     0     0     0     0     1
# 8       3    1001    4    1     0     0     0     1    NA
# 9       3    1002    3    1     0     0     1    NA    NA
# 10      3    1003    5    0     0     0     0     0     1
# 11      4    1001    4    1     0     0     0     1    NA
# 12      4    1002    3    1     0     0     1    NA    NA
# 13      5    1001    5    0     0     0     0     0     1
# 14      5    1002    4    1     0     0     0     1    NA
# 15      5    1003    3    1     0     0     1    NA    NA

Data:

# Sample data from the question, parsed from inline text. The first non-blank
# line of the text supplies the column names (header = TRUE).
df <- read.table(text="
prodID   storeID   Term    Exit
1        1001      5       0
1        1002      4       1
1        1003      3       1
1        1004      5       0
2        1001      4       1
2        1002      3       1
2        1003      5       0
3        1001      4       1
3        1002      3       1
3        1003      5       0
4        1001      4       1
4        1002      3       1
5        1001      5       0
5        1002      4       1
5        1003      3       1",
header = TRUE, stringsAsFactors = FALSE)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM