简体   繁体   中英

Excel or R: reshaping data

I have some data that looks like:

ID Data

1  a b c 
2  a b c a b c
3  a b c 
4  a b c a b c a b c a b c
5  a b c a b c a b c

I would like it in the following

ID Data

1 a b c
2 a b c
2 a b c
3 a b c
4 a b c
4 a b c
4 a b c
4 a b c
5 a b c
5 a b c
5 a b c

a , b and c are in different column cells so Data is actually many columns. I can concatenate them if need be.

Essentially a b and c are related but for some IDs I have multiple results and I'd like the data in a long format rather than wide but keeping the ID for each row.

I can do in this is R if that's easier too.

Dput:

structure(list(ID = c(9999812L, 999908L, 9993595L, 9992905L, 
9989664L, 9984487L, 9980956L, 9980112L, 9980091L, 9979915L, 9979613L, 
9979400L, 9978215L, 9976882L, 9975335L, 9974511L, 9973804L, 9973025L
), a = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "a", class = "factor"), 
    b = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "b", class = "factor"), 
    c = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "c", class = "factor"), 
    a.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "a"), class = "factor"), 
    b.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "b"), class = "factor"), 
    c.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "c"), class = "factor"), 
    a.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"), 
    b.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"), 
    c.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"), 
    a.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"), 
    b.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"), 
    c.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"), 
    a.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"), 
    b.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"), 
    c.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor")), .Names = c("ID", 
"a", "b", "c", "a.1", "b.1", "c.1", "a.2", "b.2", "c.2", "a.3", 
"b.3", "c.3", "a.4", "b.4", "c.4"), class = "data.frame", row.names = c(NA, 
-18L))

With your initial data, you can use "stringr" and "reshape2" to melt the data.

dt <- data.frame(x = 1:5, y = c( "a b c" , "a b c a b c","a b c","a b c a b c a b c a b c", "a b c a b c a b c"))

library("stringr")
library("reshape2")
maxlen <- max(lengths(str_extract_all(dt$y,"(\\w)\\s(\\w)\\s(\\w)(\\1\\s\\2\\s\\3)*")))

list_lists <- str_extract_all(dt$y,"(\\w)\\s(\\w)\\s(\\w)(\\1\\s\\2\\s\\3)*")

li <- lapply(list_lists, `length<-`,maxlen)

dtnew <- data.frame(x =cbind(dt$x),do.call("rbind",li))
dtnew1 <- melt(dtnew,id.vars="x")
dtnew1 <- dtnew1[!is.na(dtnew1$value),]
dtnew1[order(dtnew1$x),]

   > dtnew1[order(dtnew1$x),c(1,3)]
   x value
1  1 a b c
2  2 a b c
7  2 a b c
3  3 a b c
4  4 a b c
9  4 a b c
14 4 a b c
19 4 a b c
5  5 a b c
10 5 a b c
15 5 a b c
> 

EDIT : For the updated data, make a field called " concat ", which is concatenated value of columns "a" to "c.4"

You can use : concat <- data.frame(concat=do.call("paste0",dt[,2:length(dt)])) to concatenate fields

then assign dt$concat <- concat

library("stringr")
library("reshape2")

maxlen <- max(lengths(str_extract_all(dt$concat,"(\\w)(\\w)(\\w)")))

list_lists <- str_extract_all(dt$concat,"(\\w)(\\w)(\\w)")

li <- lapply(list_lists, `length<-`,maxlen)

dtnew <- data.frame(x =cbind(dt$ID),y=do.call("rbind",li))
dtnew1 <- melt(dtnew,id.vars="x")
dtnew1 <- dtnew1[!is.na(dtnew1$value),]
dtnew1[order(dtnew1$x),c(1,3)]

> dtnew1[order(dtnew1$x),c(1,3)]
         x value
2   999908   abc
18 9973025   abc
36 9973025   abc
54 9973025   abc
72 9973025   abc
90 9973025   abc
17 9973804   abc
35 9973804   abc
16 9974511   abc
15 9975335   abc
33 9975335   abc
51 9975335   abc
69 9975335   abc
14 9976882   abc
13 9978215   abc
12 9979400   abc
30 9979400   abc
48 9979400   abc
11 9979613   abc
10 9979915   abc
9  9980091   abc
8  9980112   abc
7  9980956   abc
6  9984487   abc
24 9984487   abc
5  9989664   abc
4  9992905   abc
3  9993595   abc
1  9999812   abc
19 9999812   abc
> 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM