简体   繁体   中英

Reshaping the HRS data from wide to long format and creating a time variable

I have the following dataset (containing around 25 more variables other than Weight = W and Height = H), all spanning 10 years.

Currently it has the following form and no time index.

# Example data in wide format: one row per individual, one column per
# variable/wave combination, named r<wave>weight and r<wave>height.
# Column types (double vs. integer) are kept exactly as in the original dump.
df <- data.frame(
  data = factor(1:4, labels = paste0("Ind_", 1:4)),
  r1weight = c(56, 76, 87, 64),
  r2weight = c(57, 75, 88, 66),
  r3weight = c(56, 76, 87, 65),
  r4weight = c(56L, 73L, 85L, 63L),
  r5weight = c(55L, 77L, 84L, 65L),
  r1height = c(151L, 163L, 173L, 153L),
  r2height = c(154L, 164L, NA, 154L),
  r3height = c(NA, 165L, NA, 152L),
  r4height = c(153L, 162L, 172L, 154L),
  r5height = c(152, 161, 171, 154)
)

  data  r1w r2w r3w r4w r5w r1h r2h r3h r4h r5h
1 Ind_1  56  57  56  56  55 151 154  NA 153 152
2 Ind_2  76  75  76  73  77 163 164 165 162 161
3 Ind_3  87  88  87  85  84 173  NA  NA 172 171
4 Ind_4  64  66  65  63  65 153 154 152 154 154

I need to add a time variable and reshape the data to long format, ideally getting something like this:

# Desired long-format result: one row per individual (Ind) and wave (time),
# with weight (W) and height (H) as ordinary columns.
# `time` is a factor with the five unique wave levels "1".."5"; a factor must
# not carry duplicated levels, so the levels are not repeated per individual.
dflong <- data.frame(
  time = factor(rep(1:5, times = 4)),
  Ind = rep(c(1, 2, 3, 4), each = 5),
  W = c(56, 57, 56, 56, 55, 76, 75, 76, 73, 77,
        87, 88, 87, 85, 84, 64, 66, 65, 63, 65),
  H = c(151, 154, NA, 153, 152, 163, 164, 165, 162, 161,
        173, NA, NA, 172, 171, 153, 154, 152, 154, 154)
)

which looks like this:

   time Ind  W   H
1     1   1 56 151
2     2   1 57 154
3     3   1 56  NA
4     4   1 56 153
5     5   1 55 152
6     1   2 76 163
7     2   2 75 164
8     3   2 76 165
9     4   2 73 162
10    5   2 77 161
11    1   3 87 173
12    2   3 88  NA
13    3   3 87  NA
14    4   3 85 172
15    5   3 84 171
16    1   4 64 153
17    2   4 66 154
18    3   4 65 152
19    4   4 63 154
20    5   4 65 154

I tried to use the melt command from reshape2, and so far I've got:

library(reshape2)
dflong <- melt(df,id.vars = c("idhhpn",r1w-r10w, r1h-r10h (help writing compactly),
     time(needs help constructing) )`

I don't want to write "r1w, r2w, r3w", but rather something like r1weight-r10weight, so that I don't have to write out all 10 time instances for all 25 variables.

So far I've got to this point

在此处输入图片说明

by using the following code

# Attempt to reshape HRSdata from wide to long with data.table's melt():
# each string in patterns() selects one group of columns, and the matching
# entry of value.name names the resulting long column; the group index is
# stored in `time`. The result is assigned to a variable that is also
# called `melt`.
# NOTE(review): "idhhpn" is listed as a measure pattern and is then overwritten
# with `:=` below — it looks like an identifier; if it is a single id column it
# presumably belongs in id.vars instead. Confirm against the real data.
# NOTE(review): sub("\\D+", "", HRSdata) is given the whole table rather than
# one column — presumably a single id column was intended. TODO confirm.
melt <- melt(setDT(HRSdata), measure = patterns("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"), 
     value.name = c("idhhpn", "srhlt", "highbp", "diabetes", "cancer", "lungev", "heartp", "strokev", "psychev", "arth", "obese", "agey", "marpart", "male", "black", "hispan", "logass", "logdebt", "atotal", "debt", "lths", "hsorged", "somehs", "scorAA", "bachelor", "graduate", "works62", "works65", "momagey", "dadagey", "dadalive", "momalive", "vigact3", "smokesn"), 
     variable.name = "time")[, 
      idhhpn := as.integer(sub("\\D+", "", HRSdata))][order(idhhpn)][, .(time, idhhpn, srhlt, highbp, diabetes, cancer, lungev, heartp, strokev, psychev, arth, obese, agey, marpart, male, black, hispan, logass, logdebt, atotal, debt, lths, hsorged, somehs, scorAA, bachelor, graduate, works62, works65, momagey, dadagey, dadalive, momalive, vigact3, smokesn        )]          

A tidyverse approach using gather and spread would be

library(tidyverse)

# Wide -> long -> one column per variable.
# NOTE(review): this assumes measurement columns are named <variable><sep><wave>
# with a non-alphanumeric separator (e.g. "W_1"), since separate() is used with
# its default `sep` — confirm against the actual column names.
df %>%
  # Stack every column except `data` into name/value pairs.
  gather(key = "time", value = "ind", -data) %>%
  # Split the stacked column name into the variable name and the wave.
  separate(col = time, into = c("indName", "time")) %>%
  # Give each variable its own column again.
  spread(key = indName, value = ind)


#    data time  H  W
#1  Ind_1   1 151 56
#2  Ind_1   2 154 57
#3  Ind_1   3  NA 56
#4  Ind_1   4 153 56
#5  Ind_1   5 152 55
#6  Ind_2   1 163 76
#7  Ind_2   2 164 75
#8  Ind_2   3 165 76
#9  Ind_2   4 162 73
#10 Ind_2   5 161 77
#11 Ind_3   1 173 87
#12 Ind_3   2  NA 88
#13 Ind_3   3  NA 87
#14 Ind_3   4 172 85
#15 Ind_3   5 171 84
#16 Ind_4   1 153 64
#17 Ind_4   2 154 66
#18 Ind_4   3 152 65
#19 Ind_4   4 154 63
#20 Ind_4   5 154 65

Same solution but with the revised variable names of "r[num][varname]" (by @iod):

# Reshape columns named r<wave><variable> (e.g. r1weight, r10height) to long
# format: one row per individual and wave, one column per variable.
df %>%
  # Stack every column except `data` into name/value pairs.
  gather(time, ind, -data) %>%
  # Put "_" between the wave number and the variable name. The pattern is
  # anchored at the start and captures all leading digits, so two-digit waves
  # are handled too (r10weight -> "10_weight", not "1_0weight").
  mutate(time = sub("^r([0-9]+)", "\\1_", time)) %>%
  # Split only at that first underscore (extra = "merge" keeps any later
  # underscores inside the variable name); convert = TRUE turns the wave into
  # an integer so waves sort 1, 2, ..., 10 rather than "1", "10", "2".
  separate(
    time,
    into = c("time", "indName"),
    sep = "_",
    extra = "merge",
    convert = TRUE
  ) %>%
  # Give each variable its own column again.
  spread(indName, ind)

    data time height weight
1  Ind_1    1    151     56
2  Ind_1    2    154     57
3  Ind_1    3     NA     56
4  Ind_1    4    153     56
5  Ind_1    5    152     55
6  Ind_2    1    163     76
7  Ind_2    2    164     75
8  Ind_2    3    165     76
9  Ind_2    4    162     73
10 Ind_2    5    161     77
11 Ind_3    1    173     87
12 Ind_3    2     NA     88
13 Ind_3    3     NA     87
14 Ind_3    4    172     85
15 Ind_3    5    171     84
16 Ind_4    1    153     64
17 Ind_4    2    154     66
18 Ind_4    3    152     65
19 Ind_4    4    154     63
20 Ind_4    5    154     65

You can use the melt function from data.table and then cbind the results:

# Reshape with data.table: melt once, then put the H and W values side by side.
# NOTE(review): the "^H_" / "^W_" patterns assume measurement columns named
# H_<wave> and W_<wave> — adjust them to the real column names.
setDT(df)
# Melt a single time with an explicit id column; this avoids the
# "id.vars and measure.vars are internally guessed" warning and a second melt.
long <- melt(df, id.vars = "data")
# Both subsets keep the melted row order, so binding them column-wise lines
# the H and W values up row by row. Only the needed columns are selected, so
# the result has no duplicated `data`/`variable` column names.
df <- cbind(
  long[grep("^H_", variable), .(data, H = value)],
  long[grep("^W_", variable), .(W = value)]
)
df <- df[, Ind := gsub(".*_", "", data)] ## cleaning Ind_
# Number the rows within each individual; relies on the melted row order.
df <- df[, time := seq_len(.N), by = .(Ind)]
df <- df[, .(time, W, H, Ind)]

Output-

> df
    time  W   H Ind
 1:    1 56 151   1
 2:    1 76 163   2
 3:    1 87 173   3
 4:    1 64 153   4
 5:    2 57 154   1
 6:    2 75 164   2
 7:    2 88  NA   3
 8:    2 66 154   4
 9:    3 56  NA   1
10:    3 76 165   2
11:    3 87  NA   3
12:    3 65 152   4
13:    4 56 153   1
14:    4 73 162   2
15:    4 85 172   3
16:    4 63 154   4
17:    5 55 152   1
18:    5 77 161   2
19:    5 84 171   3
20:    5 65 154   4

Another option with data.table is melt with measure = patterns(...). In the example, the column names share the common patterns 'weight' and 'height'; passing these to patterns() in the measure argument converts each group of columns to long format in a single call. The numeric part of 'data' is then extracted with sub to create 'Ind'.

library(data.table)
# Melt the "weight" and "height" column groups in parallel into W and H; the
# group position of each column is stored in `time`. Then derive a numeric
# individual id from `data`, sort by it, and keep the four result columns.
melt(
  setDT(df),
  measure.vars = patterns("weight", "height"),
  variable.name = "time",
  value.name = c("W", "H")
)[
  # Strip the leading non-digit prefix (e.g. "Ind_") to get the id number.
  , Ind := as.integer(sub("\\D+", "", data))
][
  order(Ind), .(time, Ind, W, H)
]
#   time Ind  W   H
# 1:    1   1 56 151
# 2:    2   1 57 154
# 3:    3   1 56  NA
# 4:    4   1 56 153
# 5:    5   1 55 152
# 6:    1   2 76 163
# 7:    2   2 75 164
# 8:    3   2 76 165
# 9:    4   2 73 162
#10:    5   2 77 161
#11:    1   3 87 173
#12:    2   3 88  NA
#13:    3   3 87  NA
#14:    4   3 85 172
#15:    5   3 84 171
#16:    1   4 64 153
#17:    2   4 66 154
#18:    3   4 65 152
#19:    4   4 63 154
#20:    5   4 65 154

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM