简体   繁体   中英

R reshaping dataset from wide to long when columns don't represent all points in time

My dataset looks somewhat like this:

ID job01     age_started_job01 job02     age_started_job02    job03       age_started_job03
1  "waiter"  18                "lawyer"  25                   NA          NA
2  "plumber" 18                "builder" 20                   "foreman"   25

I'm trying to get to a format where I have this (note that I assume that people stayed in the same job as the previous year if they didn't start a new one that year):

ID job_age18  job_age19  job_age20  job_age21  job_age22  job_age23  job_age24  job_age25
1  "waiter"   "waiter"   "waiter"   "waiter"   "waiter"   "waiter"   "waiter"   "lawyer"
2  "plumber"  "plumber"  "builder"  "builder"  "builder"  "builder"  "builder"  "foreman"

And then convert it to long like this:

ID  job        age
1   "waiter"   18
1   "waiter"   19
1   "waiter"   20
1   "waiter"   21
1   "waiter"   22
1   "waiter"   23
1   "waiter"   24
1   "lawyer"   25
2   "plumber"  18     
2   "plumber"  19
2   "builder"  20
2   "builder"  21
2   "builder"  22
2   "builder"  23
2   "builder"  24
2   "foreman"  25

The step from wide to long can be done with reshape2 and I know how to do it, but I cannot get from the first dataset to the intermediate format. I have tried something like this (ugly and with loops):

# Create one empty job_age<age> column for every age from 18 to 25.
# Assigning a scalar NA to a vector of new column names adds all of the
# columns in a single step, so no explicit loop is needed.
data[, paste0("job_age", 18:25)] <- NA


# Populate each job_age<age> column by looping through job spells and ages.
# A cell is overwritten only where that spell started at exactly that age;
# everywhere else the column keeps the value it already had.
for (spell in c("01", "02", "03")) {
  # Column names for this spell do not depend on age, so build them once.
  start_col <- paste0("age_started_job", spell)
  job_col <- paste0("job", spell)
  for (age in 18:25) {
    age_col <- paste0("job_age", age)
    # Place of employment: the is.na() guard turns a missing start age into
    # FALSE instead of letting NA propagate into the ifelse() test.
    started_now <- !is.na(data[, start_col]) & data[, start_col] == age
    data[, age_col] <- ifelse(started_now, data[, job_col], data[, age_col])
  }
}

Besides looking really ugly with loops and the like, it gives me an error saying that I provided 2 variables to replace 1 variable.

Is there a good, elegant way to do it, perhaps with reshape2 and plyr?

You may try

# Stack the (job, start age) column pairs into long format: one row per ID
# and job spell, with the spell number stored in `time`. The columns are
# selected by name rather than by position so a reordered `df` still works.
df1 <- reshape(
  df,
  idvar = "ID",
  timevar = "time",
  varying = list(
    c("job01", "job02", "job03"),
    c("age_started_job01", "age_started_job02", "age_started_job03")
  ),
  v.names = c("job", "age"),
  direction = "long"
)

# Every age that should appear in the result.
d1 <- data.frame(age = 18:25)

# Expand one person's job spells to one row per age, carrying the most
# recent job forward until the next one starts.
fill_ages <- function(x) {
  # all = TRUE keeps the ages at which no job started as well as the job
  # rows; merge() returns the rows sorted by age.
  x1 <- merge(x, d1, by = "age", all = TRUE)
  has_job <- !is.na(x1$job)
  # cumsum(has_job) is 0 for ages before the first job, so prepend an NA and
  # shift the index by one instead of indexing with 0 (which silently drops
  # elements). Indexing the jobs in order of appearance, rather than their
  # unique() values, also stays correct when a person returns to an earlier
  # job.
  jobs <- c(NA_character_, as.character(x1$job[has_job]))
  x1$job <- jobs[cumsum(has_job) + 1L]
  # Take the ID from the input rows: the first merged row carries no ID when
  # the first job starts after the youngest age in d1.
  x1$ID <- x$ID[1]
  # Drops ages before the first job and spells without a start age.
  na.omit(x1[, c("ID", "job", "age")])
}

res <- do.call(rbind, lapply(split(df1, df1$ID), fill_ages))

row.names(res) <- NULL
res
#   ID     job age
#1   1  waiter  18
#2   1  waiter  19
#3   1  waiter  20
#4   1  waiter  21
#5   1  waiter  22
#6   1  waiter  23
#7   1  waiter  24
#8   1  lawyer  25
#9   2 plumber  18
#10  2 plumber  19
#11  2 builder  20
#12  2 builder  21
#13  2 builder  22
#14  2 builder  23
#15  2 builder  24
#16  2 foreman  25

You could also try

 # Give the stacked job / start-age columns of df1 short names.
 colnames(df1)[3:4] <- c("job", "age")
 # One row for every ID and every age from 18 to 25.
 d1 <- expand.grid(ID = unique(df1$ID), age = 18:25)

 library(dplyr)
 # Attach the job that started at each (ID, age), if any; the spell counter
 # `time` is not needed, so it is dropped by name before joining.
 left_join(d1, df1[, setdiff(names(df1), "time")], by = c("age", "ID")) %>%
   arrange(ID, age) %>%
   group_by(ID) %>%
   # indx counts the jobs started so far; it is 0 before the first job.
   mutate(indx = cumsum(!is.na(job))) %>%
   group_by(ID, indx) %>%
   # Within one (ID, indx) run, spread the job that opened the run over the
   # whole run; a run with no job (indx == 0) stays NA.
   mutate(job = job[!is.na(job)][1]) %>%
   # Ungroup first: select() keeps grouping variables, so dropping indx
   # while it is still a grouping column would leave it in the result.
   ungroup() %>%
   select(-indx)

data

# Sample data: one row per person, holding up to three (job, start age)
# pairs. A person without a third job has NA in both job03 columns.
df <- data.frame(
  ID = 1:2,
  job01 = c("waiter", "plumber"),
  age_started_job01 = c(18L, 18L),
  job02 = c("lawyer", "builder"),
  age_started_job02 = c(25L, 20L),
  job03 = c(NA, "foreman"),
  age_started_job03 = c(NA, 25L),
  stringsAsFactors = FALSE
)

Here's another approach that uses merged.stack + expandRows from my "splitstackshape" package. This answer uses @akrun's sample data:

# Stack the job / age_started_job column pairs into long form and drop every
# row that contains an NA (i.e. job slots that were never filled).
# NOTE(review): merged.stack() and expandRows() are not defined in this
# snippet; presumably library(splitstackshape) must be loaded first — confirm.
DT <- na.omit(merged.stack(df, var.stubs = c("job", "age_started_job"),
                           sep = "var.stubs"))
# Convert the start age to numeric before doing arithmetic on it.
DT[, age_started_job := as.numeric(age_started_job)]
# Range = gap between consecutive start ages within each ID. Appending
# (last start age + 1) makes the final job's Range equal to 1.
DT[, Range := diff(c(age_started_job, (age_started_job[.N]+1))), by = list(ID)]
# Expand DT by the Range column (presumably repeating each row Range times —
# confirm against the expandRows documentation), then add 0, 1, 2, ... within
# each ID / .time_1 group so age_started_job runs through consecutive ages.
# The trailing [] prints the result.
expandRows(DT, "Range")[, age_started_job := age_started_job + 
                          (sequence(.N)-1), by = list(ID, .time_1)][]
#     ID .time_1     job age_started_job
#  1:  1      01  waiter              18
#  2:  1      01  waiter              19
#  3:  1      01  waiter              20
#  4:  1      01  waiter              21
#  5:  1      01  waiter              22
#  6:  1      01  waiter              23
#  7:  1      01  waiter              24
#  8:  1      02  lawyer              25
#  9:  2      01 plumber              18
# 10:  2      01 plumber              19
# 11:  2      02 builder              20
# 12:  2      02 builder              21
# 13:  2      02 builder              22
# 14:  2      02 builder              23
# 15:  2      02 builder              24
# 16:  2      03 foreman              25

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM