简体   繁体   中英

Melting data with several groups of column names in R

I am working with some data that has repeated measurements for multiple covariates. The sample data format looks like this:

set.seed(123)
# Three subjects: a 3 x 12 block of normal draws (the repeated measurements)
# followed by three uniform columns measured once each.
df <- data.frame(
  id = 1001:1003,
  matrix(rnorm(36), nrow = 3, ncol = 12),
  d = runif(3),
  e = runif(3),
  f = runif(3)
)
# Note: the first (id) column is named "df" here, not "id".
colnames(df) <- c(
  "df",
  paste0(rep(c("a", "b", "c"), each = 4), 1:4),
  "d", "e", "f"
)
df

  id          a1         a2  a3  a4  b1  b2  b3 b4 c1 c2 c3 c4 d e f
1001  -0.5604756 0.07050839 ......
1002  -0.2301775 0.12928774
1003   1.5587083 1.71506499 ...

I would like to transform my data into long format, with one column per covariate, like this:

  id  time          a          b        c        d       e      f
1001     1 -0.5604756  0.4007715 ..
1001     2 0.07050839
1001     3
1001     4
1002     1
1002     2
1002     3
1002     4
1003     1
1003     2
1003     3
1003     4

Thanks very much.

Here's a method using the recently released tidyr package. Note that I changed your example data slightly, do you need it to be exactly as you provided it?

Example data:

set.seed(123)
# Same draws as the question's data, but the single-measurement columns get a
# "1" suffix (d1, e1, f1) so every measurement name is <letter><digit>.
df <- data.frame(
  id = 1001:1003,
  matrix(rnorm(36), nrow = 3, ncol = 12),
  d = runif(3),
  e = runif(3),
  f = runif(3)
)
# Note: the first (id) column is named "df".
colnames(df) <- c(
  "df",
  paste0(rep(c("a", "b", "c"), each = 4), 1:4),
  paste0(c("d", "e", "f"), 1)
)
df

Code:

library(tidyr)
library(dplyr)

# Reshape to one row per (df, number) with one column per covariate letter.
# pivot_longer() (tidyr >= 1.0.0) supersedes the gather() + extract() +
# spread() chain: each column name is split by the regex into a letter and a
# digit; the ".value" sentinel turns the letter into an output column and the
# digit goes to `number`. Letters that only have a "1" column (d1, e1, f1)
# are filled with NA for the other numbers.
df %>%
  pivot_longer(
    cols = -df,
    names_to = c(".value", "number"),
    names_pattern = "([[:alpha:]])([[:digit:]])"
  ) %>%
  as.data.frame() # return a plain data.frame, as the gather/spread chain did

Result:

 df number           a          b          c            d         e         f
1  1001      1 -0.56047565  0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
2  1001      2  0.07050839  1.7869131  0.1533731           NA        NA        NA
3  1001      3  0.46091621  0.7013559  0.4264642           NA        NA        NA
4  1001      4 -0.44566197 -0.2179749  0.8781335           NA        NA        NA
5  1002      1 -0.23017749  0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
6  1002      2  0.12928774  0.4978505 -1.1381369           NA        NA        NA
7  1002      3 -1.26506123 -0.4727914 -0.2950715           NA        NA        NA
8  1002      4  1.22408180 -1.0260044  0.8215811           NA        NA        NA
9  1003      1  1.55870831 -0.5558411  0.8377870 0.4753165741 0.6127710 0.2436195
10 1003      2  1.71506499 -1.9666172  1.2538149           NA        NA        NA
11 1003      3 -0.68685285 -1.0678237  0.8951257           NA        NA        NA
12 1003      4  0.35981383 -0.7288912  0.6886403           NA        NA        NA

data.table v1.9.5+ can melt into multiple value columns at once. You can install it by following the instructions here.

# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later at melt().
library(data.table) # v1.9.5+
# One regex per covariate letter ("^a[0-9]*$", ..., "^f[0-9]*$"); the columns
# matched by each pattern are stacked into the value column of that letter.
# setDT() converts df to a data.table by reference, so df itself is modified.
melt(setDT(df),
     measure.vars = patterns(paste0("^", letters[1:6], "[0-9]*$")),
     value.name = letters[1:6])
#       df variable           a          b          c            d         e         f
#  1: 1001        1 -0.56047565  0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
#  2: 1002        1 -0.23017749  0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
#  3: 1003        1  1.55870831 -0.5558411  0.8377870 0.4753165741 0.6127710 0.2436195
#  4: 1001        2  0.07050839  1.7869131  0.1533731           NA        NA        NA
#  5: 1002        2  0.12928774  0.4978505 -1.1381369           NA        NA        NA
#  6: 1003        2  1.71506499 -1.9666172  1.2538149           NA        NA        NA
#  7: 1001        3  0.46091621  0.7013559  0.4264642           NA        NA        NA
#  8: 1002        3 -1.26506123 -0.4727914 -0.2950715           NA        NA        NA
#  9: 1003        3 -0.68685285 -1.0678237  0.8951257           NA        NA        NA
# 10: 1001        4 -0.44566197 -0.2179749  0.8781335           NA        NA        NA
# 11: 1002        4  1.22408180 -1.0260044  0.8215811           NA        NA        NA
# 12: 1003        4  0.35981383 -0.7288912  0.6886403           NA        NA        NA

Somewhat of a hack, but it works.

library(reshape)

# split the data frame into a list of data frames (one per letter), each
# holding the id column "df" plus that letter's columns. The pattern is
# anchored so that only "df" and "<letter><digits>" match, not every column
# that merely contains the letter.
dfs <- lapply(letters[1:6], function(x) {
  keep <- grepl(paste0("^(df|", x, "\\d*)$"), colnames(df))
  df[, keep, drop = FALSE]
})

# melt the data (using reshape's melt function)
dfs <- lapply(dfs, function(d) {
  melt(d, id.vars = "df", variable_name = "time")
})

# derive the numeric "time" from the digits in the original column name;
# rename "value" to the letter
for (i in seq_along(dfs)) {
  # as.numeric() on the factor would return level codes, which only equal the
  # time when the suffixes happen to be 1..k in order, so parse the digits.
  suffix <- sub("^\\D+", "", as.character(dfs[[i]]$time))
  # a column with no digit suffix (e.g. plain "d") is treated as time 1,
  # which is what the factor code gave for a single column
  suffix[suffix == ""] <- "1"
  dfs[[i]]$time <- as.numeric(suffix)
  colnames(dfs[[i]])[3] <- letters[i]
}

# merge everything back into one data frame; all = TRUE keeps times that
# exist for some letters only (the other letters get NA)
df.final <- Reduce(
  function(x, y) merge(x = x, y = y, all = TRUE, by = c("df", "time")),
  dfs
)

df.final

     df time           a          b          c            d         e         f
1  1001    1 -0.56047565  0.4007715 -0.6250393 0.7101824014 0.2201189 0.3517979
2  1001    2  0.07050839  1.7869131  0.1533731           NA        NA        NA
3  1001    3  0.46091621  0.7013559  0.4264642           NA        NA        NA
4  1001    4 -0.44566197 -0.2179749  0.8781335           NA        NA        NA
5  1002    1 -0.23017749  0.1106827 -1.6866933 0.0006247733 0.3798165 0.1111354
6  1002    2  0.12928774  0.4978505 -1.1381369           NA        NA        NA
7  1002    3 -1.26506123 -0.4727914 -0.2950715           NA        NA        NA
8  1002    4  1.22408180 -1.0260044  0.8215811           NA        NA        NA
9  1003    1  1.55870831 -0.5558411  0.8377870 0.4753165741 0.6127710 0.2436195
10 1003    2  1.71506499 -1.9666172  1.2538149           NA        NA        NA
11 1003    3 -0.68685285 -1.0678237  0.8951257           NA        NA        NA
12 1003    4  0.35981383 -0.7288912  0.6886403           NA        NA        NA

This would give you the long format of the wide columns:

# Stack the repeated-measurement columns into long format. Columns 14:16 are
# left out of the reshape; column 1 ("df") is the id.
res <- reshape(
  df[-(14:16)],
  idvar = "df",
  varying = names(df)[-c(1, 14:16)],
  sep = "", # names like "a1" have no separator (the default is ".")
  direction = "long"
)
res
         df time           a          b          c
1001.1 1001    1 -0.56047565  0.4007715 -0.6250393
1002.1 1002    1 -0.23017749  0.1106827 -1.6866933
1003.1 1003    1  1.55870831 -0.5558411  0.8377870
1001.2 1001    2  0.07050839  1.7869131  0.1533731
1002.2 1002    2  0.12928774  0.4978505 -1.1381369
1003.2 1003    2  1.71506499 -1.9666172  1.2538149
1001.3 1001    3  0.46091621  0.7013559  0.4264642
1002.3 1002    3 -1.26506123 -0.4727914 -0.2950715
1003.3 1003    3 -0.68685285 -1.0678237  0.8951257
1001.4 1001    4 -0.44566197 -0.2179749  0.8781335
1002.4 1002    4  1.22408180 -1.0260044  0.8215811
1003.4 1003    4  0.35981383 -0.7288912  0.6886403

And you could merge that with the columns that are not repeated:

 # Attach columns 14:16 of df to every long row, joining on the shared "df" column.
 merge(res, df[c(1, 14:16)], by = "df")
     df time           a          b          c            d
1  1001    1 -0.56047565  0.4007715 -0.6250393 0.7101824014
2  1001    4 -0.44566197 -0.2179749  0.8781335 0.7101824014
3  1001    3  0.46091621  0.7013559  0.4264642 0.7101824014
4  1001    2  0.07050839  1.7869131  0.1533731 0.7101824014
5  1002    2  0.12928774  0.4978505 -1.1381369 0.0006247733
6  1002    1 -0.23017749  0.1106827 -1.6866933 0.0006247733
7  1002    4  1.22408180 -1.0260044  0.8215811 0.0006247733
8  1002    3 -1.26506123 -0.4727914 -0.2950715 0.0006247733
9  1003    3 -0.68685285 -1.0678237  0.8951257 0.4753165741
10 1003    2  1.71506499 -1.9666172  1.2538149 0.4753165741
11 1003    1  1.55870831 -0.5558411  0.8377870 0.4753165741
12 1003    4  0.35981383 -0.7288912  0.6886403 0.4753165741
           e         f
1  0.2201189 0.3517979
2  0.2201189 0.3517979
3  0.2201189 0.3517979
4  0.2201189 0.3517979
5  0.3798165 0.1111354
6  0.3798165 0.1111354
7  0.3798165 0.1111354
8  0.3798165 0.1111354
9  0.6127710 0.2436195
10 0.6127710 0.2436195
11 0.6127710 0.2436195
12 0.6127710 0.2436195

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM