简体   繁体   中英

separate a string separated by ; into columns in R

I am trying to use dplyr to separate a column into multiple columns

here is the column:

name
1                                                       tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1
2                                                    tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10
3                                                    tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11
4                                                    tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12
5                                                     tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1
6 tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True

Output of dput(x), included to make the example reproducible:

structure(list(name = structure(c(4L, 1L, 2L, 3L, 6L, 5L), .Label = c("tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10", 
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11", 
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12", 
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1", 
"tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True", 
"tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1"
), class = "factor")), class = "data.frame", row.names = c(NA, 
-6L))

I want to keep only the rows where exon_rank is 1 and have each key=value pair turned into its own column. What I would like to end up with is the following:

 tx_id        tx_name      gene_id exon_id exon_name exon_rank
1     1 XM_017916188.1 LOC108556273       3       id1         1
2     7 XM_017913854.1 LOC108557084      61       id6         1
3     2 XM_017927872.1 LOC108564750       4       id8         1

I've been trying to use x %>% separate()

but it gets stuck when a field holds several comma-separated values (e.g. tx_id=8,9) instead of a single value (e.g. tx_id=1)

any help? thank you

Here is an option with tidyverse

library(dplyr)
library(tidyr)
library(data.table)
# Split each "key=value;key=value" record in x$name into one column per key,
# then keep only the records whose exon_rank is 1. Multi-valued fields such as
# "tx_id=8,9" are kept as the single string "8,9".
x %>%
  # Tag every record with its own row number *before* splitting it up. Deriving
  # the id afterwards from the n-th occurrence of each key mis-assigns keys
  # that are missing from earlier records (e.g. zero_length_insertion).
  mutate(name = as.character(name), rn = row_number()) %>%
  # One row per "key=value" pair.
  separate_rows(name, sep = ";") %>%
  separate(name, into = c("key", "value"), sep = "=") %>%
  # Back to one row per record (identified by rn), one column per key; keys
  # absent from a record become NA.
  pivot_wider(names_from = key, values_from = value) %>%
  # Values are all character at this point; let R pick proper column types.
  type.convert(as.is = TRUE) %>%
  filter(exon_rank == 1)

You could write a function that reads that type of data:

#' Parse "key=value;key=value" records into a data frame
#'
#' Each element of `data` is one record. Fields are separated by ";" and a
#' field may carry several comma-separated values (e.g. "tx_id=8,9"); such a
#' record is expanded into one row per value, with single-valued fields
#' repeated on every row. Keys that are missing from a record are `NA`.
#'
#' @param data A character vector or factor of records.
#' @return A data frame with one column per key found in `data`.
read_data <- function(data) {
  # Parse a single record, given as a character vector of "key:value" lines.
  read_data_row <- function(x) {
    con <- textConnection(x)
    # The text connection is created open and read.dcf() leaves it open, so
    # close it here; otherwise every record leaks one connection.
    on.exit(close(con), add = TRUE)
    # 1-row character matrix: column names are the keys, cells the raw values.
    u <- read.dcf(con)
    # Split every value on ","; rows are keys, columns are value positions.
    # Fields with fewer values than the widest one are padded with NA.
    # NOTE(review): read.csv() sizes the table from the first five lines only,
    # so a record whose first multi-valued field comes later may fail - confirm.
    v <- read.csv(text = u, row.names = colnames(u), header = FALSE,
                  na.strings = "")
    # Transpose to one row per value position and copy single-valued fields
    # into the padded cells.
    tidyr::fill(data.frame(t(v), row.names = NULL), dplyr::all_of(colnames(u)),
                .direction = "downup")
  }
  # read.dcf() expects "key:value" lines, hence the "=" -> ":" substitution.
  records <- strsplit(gsub("=", ":", data), ";")
  # lapply(), not sapply(): when all records have the same number of fields,
  # sapply() collapses the data frames into a list-matrix that rbind.fill()
  # rejects.
  plyr::rbind.fill(lapply(records, read_data_row))
}

Now read the data:

# Parse every record in the `name` column, then print the resulting table.
new_data <- read_data(x[["name"]])
new_data
  tx_id        tx_name      gene_id exon_id exon_name exon_rank zero_length_insertion
1     2 XM_017927872.1 LOC108564750       4       id8         1                  <NA>
2     2 XM_017927872.1 LOC108564750      13      id17        10                  <NA>
3     2 XM_017927872.1 LOC108564750      14      id18        11                  <NA>
4     2 XM_017927872.1 LOC108564750      15      id19        12                  <NA>
5     8 XM_017919249.1 LOC108560513      70      id25         1                  <NA>
6     8 XM_017919249.1 LOC108560513      70      id25         1                  True
7     9 XM_017918469.1 LOC108560513      71      id20         1                  True
  

You can subset it the way you want:

# Keep only the rows whose exon_rank is 1.
subset(new_data, subset = exon_rank == 1)
  tx_id        tx_name      gene_id exon_id exon_name exon_rank zero_length_insertion
1     2 XM_017927872.1 LOC108564750       4       id8         1                  <NA>
5     8 XM_017919249.1 LOC108560513      70      id25         1                  <NA>
6     8 XM_017919249.1 LOC108560513      70      id25         1                  True
7     9 XM_017918469.1 LOC108560513      71      id20         1                  True 

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM