簡體   English   中英

在 R 中將以 ; 分隔的字符串拆分成多列

[英]separate a string separated by ; into columns in R

我正在嘗試使用 dplyr 將一列分成多列

這是該列:

name
1                                                       tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1
2                                                    tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10
3                                                    tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11
4                                                    tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12
5                                                     tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1
6 tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True

dput(x) [使可重現]

structure(list(name = structure(c(4L, 1L, 2L, 3L, 6L, 5L), .Label = c("tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=13;exon_name=id17;exon_rank=10", 
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=14;exon_name=id18;exon_rank=11", 
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=15;exon_name=id19;exon_rank=12", 
"tx_id=2;tx_name=XM_017927872.1;gene_id=LOC108564750;exon_id=4;exon_name=id8;exon_rank=1", 
"tx_id=8,9;tx_name=XM_017919249.1,XM_017918469.1;gene_id=LOC108560513;exon_id=70,71;exon_name=id25,id20;exon_rank=1;zero_length_insertion=True", 
"tx_id=8;tx_name=XM_017919249.1;gene_id=LOC108560513;exon_id=70;exon_name=id25;exon_rank=1"
), class = "factor")), class = "data.frame", row.names = c(NA, 
-6L))

我只想獲取 exon_rank 為 1 的行,並將每個鍵拆分成單獨的列。我想得到的結果如下:

 tx_id        tx_name      gene_id exon_id exon_name exon_rank
1     1 XM_017916188.1 LOC108556273       3       id1         1
2     7 XM_017913854.1 LOC108557084      61       id6         1
3     2 XM_017927872.1 LOC108564750       4       id8         1

我一直在嘗試使用 x %>% separate()

但它會卡在 tx_id=8,9 vs tx_id=1 的情況下

有什么幫助嗎? 謝謝你

這是使用 tidyverse 的一個選項

library(dplyr)
library(tidyr)
library(data.table)
# Reshape every "key=value;key=value;..." string in `name` into one row with
# a column per key, then keep the records whose exon_rank is 1.
x %>%
  # `name` is a factor; work on plain strings instead.
  mutate(name = as.character(name)) %>%
  # One row per "key=value" pair ...
  separate_rows(name, sep = ";") %>%
  # ... split into a `key` column and a `value` column.
  separate(name, into = c("key", "value"), sep = "=") %>%
  # rowid() numbers the repeated occurrences of each key; that number is the
  # id pivot_wider() uses to put the pairs of one record on the same row.
  mutate(rn = rowid(key)) %>%
  pivot_wider(names_from = key, values_from = value) %>%
  # Re-infer column types from the strings, keeping text as character.
  type.convert(as.is = TRUE) %>%
  filter(exon_rank == 1)

您可以編寫一個讀取該類型數據的函數:

#' Parse "key=value;key=value;..." records into a data frame
#'
#' Each element of `data` is one record. Every key becomes a column. A field
#' holding several comma-separated values (e.g. `tx_id=8,9`) is expanded into
#' one row per value; fields of the same record that hold fewer values are
#' filled down, then up, so that every expanded row is complete.
#'
#' @param data Character vector (or factor) of records whose fields are
#'   separated by `;` and written as `key=value`.
#' @return A data frame with one column per key seen in any record. Keys that
#'   a record does not contain are `NA` in that record's rows.
read_data <- function(data) {
  # Parse a single record given as a character vector of "key:value" lines.
  read_data_row <- function(x) {
    # read.dcf() only closes connections it opened itself, so close the text
    # connection explicitly instead of leaking it.
    con <- textConnection(x)
    on.exit(close(con), add = TRUE)
    u <- read.dcf(con)
    # Each field value is parsed as one CSV line, which splits "8,9" into two
    # cells; shorter lines are padded with empty cells that become NA.
    v <- read.csv(text = u, row.names = colnames(u), header = FALSE,
                  na.strings = "")
    # Transpose so keys are columns, then fill the padded NAs from the
    # neighbouring rows of the same record.
    tidyr::fill(data.frame(t(v), row.names = NULL), dplyr::all_of(colnames(u)),
                .direction = "downup")
  }
  # lapply(), not sapply(): when every record has the same number of keys,
  # sapply() simplifies the data frames into a list-matrix and
  # plyr::rbind.fill() fails because its inputs are no longer data frames.
  plyr::rbind.fill(lapply(strsplit(gsub("=", ":", data), ";"), read_data_row))
}

現在讀取數據:

# Parse every record in the `name` column, then print the resulting data frame.
new_data <- read_data(x$name)
new_data
  tx_id        tx_name      gene_id exon_id exon_name exon_rank zero_length_insertion
1     2 XM_017927872.1 LOC108564750       4       id8         1                  <NA>
2     2 XM_017927872.1 LOC108564750      13      id17        10                  <NA>
3     2 XM_017927872.1 LOC108564750      14      id18        11                  <NA>
4     2 XM_017927872.1 LOC108564750      15      id19        12                  <NA>
5     8 XM_017919249.1 LOC108560513      70      id25         1                  <NA>
6     8 XM_017919249.1 LOC108560513      70      id25         1                  True
7     9 XM_017918469.1 LOC108560513      71      id20         1                  True
  

您可以按照自己的方式對其進行子集化:

# Keep only the rows whose exon_rank equals 1.
subset(new_data, exon_rank==1)
  tx_id        tx_name      gene_id exon_id exon_name exon_rank zero_length_insertion
1     2 XM_017927872.1 LOC108564750       4       id8         1                  <NA>
5     8 XM_017919249.1 LOC108560513      70      id25         1                  <NA>
6     8 XM_017919249.1 LOC108560513      70      id25         1                  True
7     9 XM_017918469.1 LOC108560513      71      id20         1                  True 

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM