I am trying to use dplyr/tidyr to separate a single column into multiple columns.
Here is the column:
name
1 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1
2 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10
3 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11
4 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12
5 tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1
6 tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True
Output of dput(x), so the example is reproducible:
# Reproducible copy of the data: a data.frame with a single factor column
# `name` and six rows. The integer codes c(4, 1, 2, 3, 6, 5) index into
# .Label, so the rows come out in the same order as the printed column above.
structure(list(name = structure(c(4L, 1L, 2L, 3L, 6L, 5L), .Label = c("tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1",
"tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True",
"tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1"
), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
I want to keep only the rows with an exon_rank of 1 and have the key=value pairs turned into columns. What I would like to end up with is the following:
tx_id tx_name gene_id exon_id exon_name exon_rank
1 1 XM_017916188.1 LOC108556273 3 id1 1
2 7 XM_017913854.1 LOC108557084 61 id6 1
3 2 XM_017927872.1 LOC108564750 4 id8 1
I've been trying to use x %>% separate(),
but it gets stuck when a field holds several values (e.g. tx_id=8,9) rather than a single one (e.g. tx_id=1).
Any help? Thank you.
Here is an option using the tidyverse:
library(dplyr)
library(tidyr)
library(data.table)
# Split each "key=value;key=value;..." string into one column per key and
# keep only the records whose exon_rank is 1.
x %>%
  # Tag every input row BEFORE splitting. Numbering the occurrences of each
  # key afterwards mis-assigns keys that are missing from earlier rows
  # (e.g. zero_length_insertion would be attached to the first record).
  mutate(rn = row_number(), name = as.character(name)) %>%
  # One row per key=value pair.
  separate_rows(name, sep = ";") %>%
  # extra = "merge" keeps any further "=" inside the value intact.
  separate(name, into = c("key", "value"), sep = "=", extra = "merge") %>%
  # `rn` is the only remaining id column, so each input row becomes one
  # output row; keys absent from a record are filled with NA.
  pivot_wider(names_from = key, values_from = value) %>%
  type.convert(as.is = TRUE) %>%
  filter(exon_rank == 1)
You could write a function that reads that type of data:
#' Parse "key=value;key=value;..." strings into a data frame.
#'
#' Each element of `data` is one record. Every `key=value` pair becomes a
#' column named after the key. A value holding several comma-separated items
#' expands the record into several rows; fields with fewer items are filled
#' from their neighbouring rows (direction "downup"). Keys missing from a
#' record are left as NA when the records are stacked.
#'
#' @param data Character vector (or factor) of ";"-separated key=value pairs.
#' @return A data frame with one column per key found in `data`.
read_data <- function(data) {
  # Parse a single record, given as a character vector of "key:value" lines.
  read_data_row <- function(x) {
    # read.dcf() does not close a connection handed to it, so release it
    # ourselves to avoid leaking one connection per record.
    con <- textConnection(x)
    on.exit(close(con), add = TRUE)
    u <- read.dcf(con)
    # Re-read the values as CSV so "a,b" is split into separate cells; rows
    # of `v` are the keys, columns are the comma-separated items.
    v <- read.csv(text = u, row.names = colnames(u), header = FALSE,
                  na.strings = "")
    # Transpose to one column per key, then fill the shorter fields.
    tidyr::fill(data.frame(t(v), row.names = NULL),
                dplyr::all_of(colnames(u)),
                .direction = "downup")
  }
  # DCF uses ":" between key and value, hence the "=" -> ":" substitution.
  records <- strsplit(gsub("=", ":", data, fixed = TRUE), ";", fixed = TRUE)
  # lapply(), not sapply(): sapply() collapses the per-record data frames into
  # a list-matrix whenever all records have the same number of keys (including
  # a single record), which rbind.fill() rejects.
  plyr::rbind.fill(lapply(records, read_data_row))
}
Now read the data:
# Parse the `name` column of `x` into one column per key, then display it.
new_data <- read_data(x[["name"]])
new_data
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
2 2 XM_017927872.1 LOC108564750 13 id17 10 <NA>
3 2 XM_017927872.1 LOC108564750 14 id18 11 <NA>
4 2 XM_017927872.1 LOC108564750 15 id19 12 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True
You can subset it the way you want:
# Keep only the records whose exon_rank is 1; which() drops NA comparisons,
# matching what subset() does.
new_data[which(new_data$exon_rank == 1), , drop = FALSE]
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.