简体   繁体   中英

Is there a regular expression to direct columns to specific rows when using pivot_longer() in R?

The first three rows of my real data look like this:

# Real data example
# One row per species; every measurement is stored in a separate column whose
# name ends in the treatment level (0, 10, 20 or 35).
fullname <- c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis")

# Mean germination coefficient per treatment level
Mean.Germ.coef_0 <- c(0.31, 0.768015267, 0.555758514)
Mean.Germ.coef_10 <- c(0.119356725, 0.624444444, 0.479357585)
Mean.Germ.coef_20 <- c(0.01, 0.202431661, 0.01)
Mean.Germ.coef_35 <- c(0.01, 0.021111111, 0.01)

# Standard deviation of the germination coefficient per treatment level
sd.germ.coef_0 <- c(0.055079106, 0.148040638, 0.199485791)
sd.germ.coef_10 <- c(0.15341342, 0.079546759, 0.068405754)
sd.germ.coef_20 <- c(0, 0.059160256, 0)
sd.germ.coef_35 <- c(0, 0.022308189, 0)

# Sample size per treatment level
n_0 <- c(5, 5, 5)
n_10 <- c(5, 5, 5)
n_20 <- c(5, 5, 5)
n_35 <- c(5, 5, 5)

# LRR and its variance; there are no columns for treatment 0, and these names
# carry no underscore before the treatment level.
LRR10 <- c(-0.954455598, -0.206947247, -0.147887029)
LRR_var10 <- c(0.336731047, 0.010676627, 0.029840885)
LRR20 <- c(-3.433987204, -1.333407261, -4.017748779)
LRR_var20 <- c(0.006313648, 0.024512868, 0.025768057)
LRR35 <- c(-3.433987204, -3.594010117, -4.017748779)
LRR_var35 <- c(0.006313648, 0.230755613, 0.025768057)

# The n_* columns are included so that the long result can contain `n`.
df <- data.frame(
  fullname,
  Mean.Germ.coef_0, Mean.Germ.coef_10, Mean.Germ.coef_20, Mean.Germ.coef_35,
  sd.germ.coef_0, sd.germ.coef_10, sd.germ.coef_20, sd.germ.coef_35,
  n_0, n_10, n_20, n_35,
  LRR10, LRR_var10, LRR20, LRR_var20, LRR35, LRR_var35
)

I need a long data.frame (or tibble) where specific columns go into specific rows, like this:

# Example output
# One row per species and treatment level. Every vector below is ordered
# species-major (all four treatments of the first species, then the second,
# ...) so that it lines up with `fullname` and `Treat`.
fullname <- rep(
  c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis"),
  each = 4
)
Treat <- rep(c(0, 10, 20, 35), times = 3)
Mean.Germ.coef <- c(
  0.31, 0.119356725, 0.01, 0.01,
  0.768015267, 0.624444444, 0.202431661, 0.021111111,
  0.555758514, 0.479357585, 0.01, 0.01
)
sd.germ.coef <- c(
  0.055079106, 0.15341342, 0, 0,
  0.148040638, 0.079546759, 0.059160256, 0.022308189,
  0.199485791, 0.068405754, 0, 0
)
n <- rep(5, 12)
# LRR and LRR_var have no value at treatment 0. A real NA (not the string
# "NA") keeps both columns numeric.
LRR <- c(
  NA, -0.954455598, -3.433987204, -3.433987204,
  NA, -0.206947247, -1.333407261, -3.594010117,
  NA, -0.147887029, -4.017748779, -4.017748779
)
LRR_var <- c(
  NA, 0.336731047, 0.006313648, 0.006313648,
  NA, 0.010676627, 0.024512868, 0.230755613,
  NA, 0.029840885, 0.025768057, 0.025768057
)
output <- data.frame(fullname, Treat, Mean.Germ.coef, sd.germ.coef,
                     n, LRR, LRR_var)

I am trying to use tidyr::pivot_longer(), but I think my problem is a lack of understanding of regular expressions.

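We can use pivot_longer() with names_pattern to capture substrings of the column names, i.e. the second capture group includes only the digits ( \\d+ ) at the end ( $ ) of the string. Some columns have a _ before the digits and some don't. So, we can use [_\\D] before that capture group to remove the separator and capture the rest of the preceding characters as the first group ( (.*) ). Note that [_\\D] always consumes exactly one character, so for a name with no underscore (e.g. LRR10) it also removes the last letter (giving LR); making the first group lazy and the underscore optional, (.*?)_?(\\d+)$, avoids this.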

library(dplyr)
library(tidyr)

# Reshape `df` to one row per species and treatment level.
#
# names_pattern splits every column name into two capture groups:
#   (.*?)   -> ".value": the measurement name, which becomes an output column
#   (\\d+)$ -> "Treat":  the trailing digits, i.e. the treatment level
# The optional `_?` between the groups drops the separator when there is one.
# The first group is lazy so that it leaves that underscore to `_?`.
# A one-character class such as [_\\D] in that position would also swallow the
# last letter of names that have no underscore (LRR10 -> "LR",
# LRR_var10 -> "LRR_va").
#
# names_transform converts Treat from character to integer.
df %>%
  pivot_longer(
    cols = -fullname,
    names_to = c(".value", "Treat"),
    names_pattern = "(.*?)_?(\\d+)$",
    names_transform = list(Treat = as.integer)
  )

# Output:
#> # A tibble: 12 × 7
#>    fullname           Treat Mean.Germ.coef sd.germ.coef    LRR  LRR_var     n
#>    <chr>              <int>          <dbl>        <dbl>  <dbl>    <dbl> <dbl>
#>  1 Argemone glauca        0         0.31         0.0551 NA     NA           5
#>  2 Argemone glauca       10         0.119        0.153  -0.954  0.337       5
#>  3 Argemone glauca       20         0.01         0      -3.43   0.00631     5
#>  4 Argemone glauca       35         0.01         0      -3.43   0.00631     5
#>  5 Bacopa monnieri        0         0.768        0.148  NA     NA           5
#>  6 Bacopa monnieri       10         0.624        0.0795 -0.207  0.0107      5
#>  7 Bacopa monnieri       20         0.202        0.0592 -1.33   0.0245      5
#>  8 Bacopa monnieri       35         0.0211       0.0223 -3.59   0.231       5
#>  9 Brighamia insignis     0         0.556        0.199  NA     NA           5
#> 10 Brighamia insignis    10         0.479        0.0684 -0.148  0.0298      5
#> 11 Brighamia insignis    20         0.01         0      -4.02   0.0258      5
#> 12 Brighamia insignis    35         0.01         0      -4.02   0.0258      5

Or another option with names_sep

library(stringr)

# Alternative: split each column name at a zero-width position instead of
# matching the whole name.
#   (?<=\\D)(?=\\d+$) is the boundary between a non-digit and the trailing
#   run of digits, so "Mean.Germ.coef_0" -> "Mean.Germ.coef_" + "0" and
#   "LRR_var10" -> "LRR_var" + "10".
# The split keeps the separator on the left-hand piece, so the trailing "_" is
# stripped from the new column names afterwards. The pattern is anchored
# ("_$") so that the underscore inside LRR_var is kept; an unanchored "_"
# would turn it into LRRvar.
#
# names_transform converts Treat from character to integer.
df %>%
  pivot_longer(
    cols = -fullname,
    names_to = c(".value", "Treat"),
    names_sep = "(?<=\\D)(?=\\d+$)",
    names_transform = list(Treat = as.integer)
  ) %>%
  rename_with(~ str_remove(.x, "_$"))

# Output:
#> # A tibble: 12 × 7
#>    fullname           Treat Mean.Germ.coef sd.germ.coef    LRR  LRR_var     n
#>    <chr>              <int>          <dbl>        <dbl>  <dbl>    <dbl> <dbl>
#>  1 Argemone glauca        0         0.31         0.0551 NA     NA           5
#>  2 Argemone glauca       10         0.119        0.153  -0.954  0.337       5
#>  3 Argemone glauca       20         0.01         0      -3.43   0.00631     5
#>  4 Argemone glauca       35         0.01         0      -3.43   0.00631     5
#>  5 Bacopa monnieri        0         0.768        0.148  NA     NA           5
#>  6 Bacopa monnieri       10         0.624        0.0795 -0.207  0.0107      5
#>  7 Bacopa monnieri       20         0.202        0.0592 -1.33   0.0245      5
#>  8 Bacopa monnieri       35         0.0211       0.0223 -3.59   0.231       5
#>  9 Brighamia insignis     0         0.556        0.199  NA     NA           5
#> 10 Brighamia insignis    10         0.479        0.0684 -0.148  0.0298      5
#> 11 Brighamia insignis    20         0.01         0      -4.02   0.0258      5
#> 12 Brighamia insignis    35         0.01         0      -4.02   0.0258      5

data

# Sample data used by the solutions above: one row per species, one column per
# measurement and treatment level (the level is the trailing number in the
# column name). The LRR columns have no treatment-0 counterpart.
df <- data.frame(
  fullname = c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis"),
  Mean.Germ.coef_0 = c(0.31, 0.768015267, 0.555758514),
  Mean.Germ.coef_10 = c(0.119356725, 0.624444444, 0.479357585),
  Mean.Germ.coef_20 = c(0.01, 0.202431661, 0.01),
  Mean.Germ.coef_35 = c(0.01, 0.021111111, 0.01),
  sd.germ.coef_0 = c(0.055079106, 0.148040638, 0.199485791),
  sd.germ.coef_10 = c(0.15341342, 0.079546759, 0.068405754),
  sd.germ.coef_20 = c(0, 0.059160256, 0),
  sd.germ.coef_35 = c(0, 0.022308189, 0),
  LRR10 = c(-0.954455598, -0.206947247, -0.147887029),
  LRR_var10 = c(0.336731047, 0.010676627, 0.029840885),
  LRR20 = c(-3.433987204, -1.333407261, -4.017748779),
  LRR_var20 = c(0.006313648, 0.024512868, 0.025768057),
  LRR35 = c(-3.433987204, -3.594010117, -4.017748779),
  LRR_var35 = c(0.006313648, 0.230755613, 0.025768057),
  n_0 = c(5, 5, 5),
  n_10 = c(5, 5, 5),
  n_20 = c(5, 5, 5),
  n_35 = c(5, 5, 5),
  stringsAsFactors = FALSE
)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM