简体   繁体   中英

Transforming long dataset to wide with a twist in R?

Here is a sample of my dataframe:

structure(list(user_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L), press_id = c(1L, 2L, 3L, 4L, 1L, 
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), press_acc_mag = c(23.6537093679718, 
23.0846851796042, 23.1615193166382, 23.1611737715988, 23.1693053238739, 
22.6002811355062, 22.6771152725402, 22.6767697275009, 23.2636642131927, 
22.6946400248251, 22.7714741618591, 22.7711286168198, 23.9096268064615, 
23.3406026180939, 23.4174367551279, 23.4170912100885), release_acc_mag = c(22.444559535822, 
22.3974678281557, 22.5370781850474, 22.3567189737439, 22.4517313344457, 
22.4046396267793, 22.544249983671, 22.3638907723676, 22.4494140705383, 
22.4023223628719, 22.5419327197636, 22.3615735084602, 22.4099252863741, 
22.3628335787077, 22.5024439355994, 22.322084724296), max_acc = c(23.7327911876129, 
23.8772090302736, 23.4628943410094, 23.4333331415454, 23.8558416360749, 
24.0002594787356, 23.5859447894714, 23.5563835900074, 23.5419580147544, 
23.686375857415, 23.2720611681509, 23.2424999686869, 24.0218455738806, 
24.1662634165412, 23.7519487272771, 23.7223875278131), min_acc = c(22.2863243700941, 
22.2026521966429, 22.1872267275715, 22.2835176932173, 22.2213018995416, 
22.1376297260904, 22.122204257019, 22.2184952226648, 22.247136081127, 
22.1634639076758, 22.1480384386044, 22.2443294042502, 22.2763940267469, 
22.1927218532957, 22.1772963842243, 22.2735873498701), avg_acc = c(22.9971077741663, 
22.8805170955298, 22.8162218339998, 22.8998658131877, 22.8198237605325, 
22.7032330818959, 22.6389378203659, 22.7225817995538, 22.9432723377064, 
22.8266816590698, 22.7623863975398, 22.8460303767277, 22.6997818029301, 
22.5831911242936, 22.5188958627636, 22.6025398419515), press_vel_ang_mag = c(2.99159368861775, 
3.69840841890355, 3.32010086461129, 3.59665109519773, 2.63983403890421, 
3.3346628319386, 2.95635527764634, 3.19667346535676, 3.18228873692629, 
3.88910346721209, 3.51079591291983, 3.78734614350627, 3.25113160166759, 
3.94596039470198, 3.56765284040972, 3.80797102812014), release_vel_ang_mag = c(3.92468401207319, 
4.03008319373796, 4.1763926713929, 3.97486508224465, 4.058922586184, 
4.16432176784877, 4.31063124550371, 4.13278517230109, 3.99069606133371, 
4.09609524299848, 4.24240472065342, 4.04087713150517, 4.02678472891242, 
4.13218391057718, 4.27849338823212, 4.1006473150295), max_vel_ang = c(4.1156188310448, 
4.35656997515235, 4.36732749036451, 4.27129525975646, 3.80582301429526, 
4.04677415840281, 4.05753167361497, 3.96149944300692, 3.65238824784895, 
3.8933393919565, 3.90409690716865, 3.8080646765606, 3.81641565599981, 
4.05736680010736, 4.06812431531952, 3.97209208471146), min_vel_ang = c(3.16836561463486, 
3.30917959689493, 3.4968727906284, 3.4259687986302, 2.94495943811947, 
3.08577342037954, 3.27346661411301, 3.20256262211481, 2.79156870664285, 
2.93238268890291, 3.12007588263639, 3.04917189063818, 3.17862422648419, 
3.31943820874426, 3.50713140247773, 3.43622741047952), avg_vel_ang = c(3.49530401281139, 
3.62360655957713, 3.71126066224395, 3.66119576608271, 3.31029178731189, 
3.43859433407763, 3.52624843674446, 3.47618354058321, 3.36223808478038, 
3.49054063154613, 3.57819473421295, 3.52812983805171, 3.47104738292443, 
3.59934992969018, 3.687004032357, 3.63693913619575), press_size = c(2.08045207961665, 
2.09221678961665, 2.08045207961665, 2.08437364761665, 2.07665439720744, 
2.08841910720744, 2.07665439720744, 2.08057596520744, 2.07421744576499, 
2.08598215576499, 2.07421744576499, 2.07813901376499, 2.07581316516794, 
2.08757787516794, 2.07581316516794, 2.07973473316794), release_size = c(2.08792744972158, 
2.08792744972158, 2.08400587572158, 2.08792744972158, 2.0914990670944, 
2.0914990670944, 2.0875774930944, 2.0914990670944, 2.09029239656691, 
2.09029239656691, 2.08637082256691, 2.09029239656691, 2.09468365117687, 
2.09468365117687, 2.09076207717687, 2.09468365117687), intra_digit_time = c(76072848, 
100906925, 65971232, 66473272, 76072850, 100906927, 65971234, 
66473274, 76072848, 100906925, 65971232, 66473272, 76072850, 
100906927, 65971234, 66473274), inter_digit_time = c(0, 229174998, 
324746190, 496211113, 0, 229174996, 324746188, 496211111, 0, 
229174998, 324746190, 496211113, 0, 229174996, 324746188, 496211111
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-16L), .Names = c("user_id", "press_id", "press_acc_mag", "release_acc_mag", 
"max_acc", "min_acc", "avg_acc", "press_vel_ang_mag", "release_vel_ang_mag", 
"max_vel_ang", "min_vel_ang", "avg_vel_ang", "press_size", "release_size", 
"intra_digit_time", "inter_digit_time"))

I want to transform it to be a 2 rows dataset like this (blank values should be with the appropriate numbers):

user_id press_acc_mag_press_id_1 press_acc_mag_press_id_2 press_acc_mag_press_id_3 press_acc_mag_press_id_4 release_acc_mag_press_id_1 release_acc_mag_press_id_2 release_acc_mag_press_id_3 release_acc_mag_press_id_4 ..... 
    1
    2

Please advise how can I transform it to such format? I am trying to use tidyr::gather and spread:

library(dplyr)
library(tidyr)
df %>% filter(replicate %in% c(1,2), user_id %in% c(1,2)) %>% ungroup() %>%
  select(user_id, 
         press_acc_mag, release_acc_mag, max_acc, min_acc, avg_acc,
         press_vel_ang_mag, release_vel_ang_mag, max_vel_ang, min_vel_ang, avg_vel_ang,
         press_size, release_size, intra_digit_time, inter_digit_time) %>% 
  gather(key, value)

but without luck, please advise.

NOTE: HERE I HAVE 4 ROWS PER USER ID, MEANING THAT I NEED X4 COLUMNS, SO EACH COLUMN SHOULD HAVE A PREFIX FROM EACH PRESS_ID (1-4)

Although the question has been tagged with dplyr and tidyr I want to point out for the sake of completeness that dcast() is able to reshape multiple value columns simultaneously to wide format. Therefore, the gather() step used in the other answers can be completely skipped.

In addition, the rowid() function is used to handle the multiple records per user_id in OP's sample dataset conveniently.

In addition, the original order of columns is preserved as shown in OP's expected output ( press_acc_mag_1 , press_acc_mag_2 , etc instead of avg_acc_1 , avg_acc_2 , etc)

library(data.table)   # version 1.11.4 used
dcast(setDT(df), user_id + rowid(sub_id <- press_id, user_id) ~ press_id, 
      value.var = setdiff(names(df), c("user_id", "press_id")))
  user_id sub_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4 1: 1 1 23.65371 23.08469 23.16152 23.16117 2: 1 2 23.26366 22.69464 22.77147 22.77113 3: 2 1 23.16931 22.60028 22.67712 22.67677 4: 2 2 23.90963 23.34060 23.41744 23.41709 release_acc_mag_1 release_acc_mag_2 release_acc_mag_3 release_acc_mag_4 max_acc_1 max_acc_2 1: 22.44456 22.39747 22.53708 22.35672 23.73279 23.87721 2: 22.44941 22.40232 22.54193 22.36157 23.54196 23.68638 3: 22.45173 22.40464 22.54425 22.36389 23.85584 24.00026 4: 22.40993 22.36283 22.50244 22.32208 24.02185 24.16626 max_acc_3 max_acc_4 min_acc_1 min_acc_2 min_acc_3 min_acc_4 avg_acc_1 avg_acc_2 avg_acc_3 1: 23.46289 23.43333 22.28632 22.20265 22.18723 22.28352 22.99711 22.88052 22.81622 2: 23.27206 23.24250 22.24714 22.16346 22.14804 22.24433 22.94327 22.82668 22.76239 3: 23.58594 23.55638 22.22130 22.13763 22.12220 22.21850 22.81982 22.70323 22.63894 4: 23.75195 23.72239 22.27639 22.19272 22.17730 22.27359 22.69978 22.58319 22.51890 ... ... 
 library(tidyverse)
 df[1:8,] %>% group_by(user_id) %>%
              gather(key,value,-user_id, -press_id) %>% mutate(new=paste0(key,'_',press_id)) %>% 
              select(-press_id,-key) %>% spread(new,value)

The basic idea is to gather the values first so the data set is in completely long format, then make a new variable which is the combination of the variable name and press_id, and then spread back.

In your example, you somehow have doubled each user id, so I've taken only the first 8 rows.

Note that you can optionally keep the order of the columns by converting the variable to a factor and keeping the original order.

df[1:8,] %>% gather("variable", "value", -c(user_id, press_id)) %>% 
  mutate(X=paste(variable, press_id, sep="_")) %>%
  select(-c(press_id, variable)) %>%
  mutate_at("X", forcats::fct_inorder) %>%
  spread(X, value)

## # A tibble: 2 x 57
##   user_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4 release_acc_mag_1 release_acc_mag_2
## *   <int>           <dbl>           <dbl>           <dbl>           <dbl>             <dbl>             <dbl>
## 1       1            23.7            23.1            23.2            23.2              22.4              22.4
## 2       2            23.2            22.6            22.7            22.7  

There are more columns, which I'm not showing here.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM