[英]separate a string separated by ; into columns in R
我正在嘗試使用 dplyr 將一列分成多列
這是該列的內容:
name
1 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1
2 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10
3 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11
4 tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12
5 tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1
6 tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True
dput(x) 的輸出(方便重現問題):
structure(list(name = structure(c(4L, 1L, 2L, 3L, 6L, 5L), .Label = c("tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12",
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1",
"tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True",
"tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1"
), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
我只想保留 exon_rank 為 1 的記錄,並把每個 key 拆成獨立的列。我希望得到類似下面的結果:
tx_id tx_name gene_id exon_id exon_name exon_rank
1 1 XM_017916188.1 LOC108556273 3 id1 1
2 7 XM_017913854.1 LOC108557084 61 id6 1
3 2 XM_017927872.1 LOC108564750 4 id8 1
我一直在嘗試使用 x %>% separate()
但是當 tx_id=8,9(多個值)與 tx_id=1(單個值)同時存在時,它就會卡住。
有人能幫忙嗎?謝謝!
以下是使用 tidyverse 的一種做法:
library(dplyr)
library(tidyr)
library(data.table)
# Split each `name` string ("key=value;key=value;...") into one column per
# key, then keep only the records whose exon_rank is 1.
x %>%
  mutate(
    name = as.character(name),
    # Tag every source row BEFORE it is split. Numbering occurrences per key
    # afterwards (e.g. with rowid(key)) mis-assigns any key that is absent
    # from earlier rows: its first occurrence would be given row id 1.
    rn = row_number()
  ) %>%
  # One row per "key=value" pair; `rn` is carried along with each pair.
  separate_rows(name, sep = ";") %>%
  separate(name, into = c("key", "value"), sep = "=") %>%
  # Back to one row per source record; keys missing from a record become NA.
  pivot_wider(names_from = key, values_from = value) %>%
  # Values are still strings here; convert columns that parse cleanly.
  type.convert(as.is = TRUE) %>%
  filter(exon_rank == 1)
您可以編寫一個讀取該類型數據的函數:
# Parse "key=value;key=value;..." strings into a data frame with one column
# per key.
#
# Args:
#   data: character vector (or factor) of ";"-separated "key=value" entries.
#
# Returns:
#   A data frame combining all entries. A comma-separated value expands its
#   entry into several rows, with the entry's other values filled into those
#   rows; keys missing from an entry are NA.
read_data <- function(data) {
  read_data_row <- function(x) {
    # The text connection is not closed by read.dcf(), so close it ourselves.
    con <- textConnection(x)
    on.exit(close(con), add = TRUE)
    # One-row character matrix whose column names are the keys.
    u <- read.dcf(con)
    # Each value is read as one CSV line, so "8,9" is split across columns.
    v <- read.csv(
      text = u, row.names = colnames(u), header = FALSE, na.strings = ""
    )
    # Transpose so keys become columns, then fill the gaps left by keys that
    # had fewer comma-separated values than the others.
    tidyr::fill(
      data.frame(t(v), row.names = NULL),
      dplyr::all_of(colnames(u)),
      .direction = "downup"
    )
  }
  # read.dcf() expects "key: value" fields, hence "=" -> ":".
  fields <- strsplit(gsub("=", ":", data), ";")
  # lapply() always returns a list of data frames; sapply() would collapse
  # them into a list-matrix when every entry has the same number of keys,
  # which rbind.fill() rejects.
  plyr::rbind.fill(lapply(fields, read_data_row))
}
現在讀取數據:
# Parse the `name` column of `x` with read_data() and print the result.
new_data <- read_data(x$name)
new_data
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
2 2 XM_017927872.1 LOC108564750 13 id17 10 <NA>
3 2 XM_017927872.1 LOC108564750 14 id18 11 <NA>
4 2 XM_017927872.1 LOC108564750 15 id19 12 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True
您可以按照自己的方式對其進行子集化:
# Keep only the rows of new_data whose exon_rank equals 1.
subset(new_data, exon_rank==1)
tx_id tx_name gene_id exon_id exon_name exon_rank zero_length_insertion
1 2 XM_017927872.1 LOC108564750 4 id8 1 <NA>
5 8 XM_017919249.1 LOC108560513 70 id25 1 <NA>
6 8 XM_017919249.1 LOC108560513 70 id25 1 True
7 9 XM_017918469.1 LOC108560513 71 id20 1 True
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.