簡體   English   中英

拆分字符串以在R中的數據文件中創建不同的列

[英]Splitting string to make different columns in a data file in R

我有一個具有以下格式的數據文件

2       C4b     6382078 6381944 chr20|C4b:6382078|C4b:6381944|reg|-     2       Thymus_M_GSM1328751     reg
4       Rpl4    68832532        68832743        chr8|Rpl4:68832532|Rpl4:68832743|reg|+  4       Thymus_M_GSM1328751     reg
3       Dntt    267744370       267746423       chr1|Dntt:267744370|Dntt:267746423|reg|+        3       Thymus_M_GSM1328751     reg
2       Sptbn1  114201107       114200202       chr14|Sptbn1:114201107|Sptbn1:114200202|reg|-   2       Thymus_M_GSM1328751     reg
2       Ndufb7  35680273        35683909        chr19|Ndufb7:35680273|Ndufb7:35683909|reg|+     2       Thymus_M_GSM1328751     reg

現在,為簡單起見,讓我們看一下第一行:

2       C4b     6382078 6381944 chr20|C4b:6382078|C4b:6381944|reg|-     2       Thymus_M_GSM1328751     reg

這裡最重要的部分是第五列 chr20|C4b:6382078|C4b:6381944|reg|- 和第六列。

我基本上想把第五列中的 chr20、C4b、6382078、C4b、6381944、reg 等各個字段拆分成 V1–V7 列,然後把原始數據的第六列作為 V8 列。也就是說,我想得到:

V1     V2   V3     V4   V5      V6    V7     V8     
chr20 C4b 6382078 C4b  6381944  reg   -     Thymus_M_GSM1328751 

有人可以幫我嗎? 我真的不知道如何在R中處理此類字符串。

假設您的表名為 df:

library(stringr)

# Tokenise a single V5 entry: every maximal run of characters that is neither
# ":" nor "|" becomes one token.
extract_fields <- function(entry) {
  unlist(str_extract_all(entry, "[^\\:\\|]+"))
}

# One token vector per row, stacked row-wise into a character matrix.
field_matrix <- do.call(rbind, lapply(df$V5, extract_fields))

# Turn the matrix into a data frame (columns X1, X2, ...) and append the V7
# column of the input as X8.
result <- data.frame(field_matrix, X8 = df$V7)
result
#      X1     X2        X3     X4        X5  X6 X7                  X8
# 1 chr20    C4b   6382078    C4b   6381944 reg  - Thymus_M_GSM1328751
# 2  chr8   Rpl4  68832532   Rpl4  68832743 reg  + Thymus_M_GSM1328751
# 3  chr1   Dntt 267744370   Dntt 267746423 reg  + Thymus_M_GSM1328751
# 4 chr14 Sptbn1 114201107 Sptbn1 114200202 reg  - Thymus_M_GSM1328751
# 5 chr19 Ndufb7  35680273 Ndufb7  35683909 reg  + Thymus_M_GSM1328751

該正則表達式會匹配由一個或多個既不是“|”也不是“:”的字符組成的任意片段。

如果您的表非常大,則使用data.tables的解決方案可能會更快。

library(data.table)
library(stringr)    # provides str_extract_all(), used in j below

setDT(df)           # convert df to a data.table

# Process one row per group: split that row's V5 on ":" and "|" and append V7
# as the last value. as.list() turns each token into its own column; the
# columns are unnamed, so they are labelled V1, V2, ... in the result.
# The row index is built with seq_len() rather than 1:nrow(df), because
# 1:nrow(df) evaluates to c(1, 0) when the table has no rows. The grouping
# column is named explicitly so that it is still called "nrow" in the output.
result <- df[,
  as.list(c(unlist(str_extract_all(V5, "[^\\:\\|]+")), as.character(V7))),
  by = list(nrow = seq_len(nrow(df)))
]
result
#    nrow    V1     V2        V3     V4        V5  V6 V7                  V8
# 1:    1 chr20    C4b   6382078    C4b   6381944 reg  - Thymus_M_GSM1328751
# 2:    2  chr8   Rpl4  68832532   Rpl4  68832743 reg  + Thymus_M_GSM1328751
# 3:    3  chr1   Dntt 267744370   Dntt 267746423 reg  + Thymus_M_GSM1328751
# 4:    4 chr14 Sptbn1 114201107 Sptbn1 114200202 reg  - Thymus_M_GSM1328751
# 5:    5 chr19 Ndufb7  35680273 Ndufb7  35683909 reg  + Thymus_M_GSM1328751

您可以使用strsplit函數,而無需任何其他庫:

# Sample data: five rows of the input file with columns named V1-V8.
# V5 holds the "|"- and ":"-delimited string that the code below splits apart.
# stringsAsFactors = FALSE keeps the text columns as character vectors.
ddf <- data.frame(
  V1 = c(2L, 4L, 3L, 2L, 2L),
  V2 = c("C4b", "Rpl4", "Dntt", "Sptbn1", "Ndufb7"),
  V3 = c(6382078L, 68832532L, 267744370L, 114201107L, 35680273L),
  V4 = c(6381944L, 68832743L, 267746423L, 114200202L, 35683909L),
  V5 = c(
    "chr20|C4b:6382078|C4b:6381944|reg|-",
    "chr8|Rpl4:68832532|Rpl4:68832743|reg|+",
    "chr1|Dntt:267744370|Dntt:267746423|reg|+",
    "chr14|Sptbn1:114201107|Sptbn1:114200202|reg|-",
    "chr19|Ndufb7:35680273|Ndufb7:35683909|reg|+"
  ),
  V6 = c(2L, 4L, 3L, 2L, 2L),
  V7 = c(
    "Thymus_M_GSM1328751", "Thymus_M_GSM1328751", "Thymus_M_GSM1328751",
    "Thymus_M_GSM1328751", "Thymus_M_GSM1328751"
  ),
  V8 = c("reg", "reg", "reg", "reg", "reg"),
  stringsAsFactors = FALSE
)
> 
> ddf
  V1     V2        V3        V4                                            V5 V6                  V7  V8
1  2    C4b   6382078   6381944           chr20|C4b:6382078|C4b:6381944|reg|-  2 Thymus_M_GSM1328751 reg
2  4   Rpl4  68832532  68832743        chr8|Rpl4:68832532|Rpl4:68832743|reg|+  4 Thymus_M_GSM1328751 reg
3  3   Dntt 267744370 267746423      chr1|Dntt:267744370|Dntt:267746423|reg|+  3 Thymus_M_GSM1328751 reg
4  2 Sptbn1 114201107 114200202 chr14|Sptbn1:114201107|Sptbn1:114200202|reg|-  2 Thymus_M_GSM1328751 reg
5  2 Ndufb7  35680273  35683909   chr19|Ndufb7:35680273|Ndufb7:35683909|reg|+  2 Thymus_M_GSM1328751 reg
> 

# Split every element of `values` on the regular expression `sep` and stack
# the pieces into a character matrix with one row per element.
split_to_matrix <- function(values, sep) {
  do.call(rbind, strsplit(as.character(values), sep))
}

# 1. Break V5 on "|": adds columns X1..X5 (X2 and X3 still hold "name:pos").
ddf <- data.frame(ddf, split_to_matrix(ddf$V5, '\\|'))
# 2. Break X2 on ":": adds columns X1.1 and X2.1.
ddf <- data.frame(ddf, split_to_matrix(ddf$X2, ':'))
# 3. Break X3 on ":": adds columns X1.2 and X2.2.
ddf <- data.frame(ddf, split_to_matrix(ddf$X3, ':'))

# Drop the columns that have now been split further: V5 (position 5) and
# X2 / X3 (positions 10 and 11).
ddf <- ddf[, -c(5, 10, 11)]
ddf
  V1     V2        V3        V4 V6                  V7  V8    X1  X4 X5   X1.1      X2.1   X1.2      X2.2
1  2    C4b   6382078   6381944  2 Thymus_M_GSM1328751 reg chr20 reg  -    C4b   6382078    C4b   6381944
2  4   Rpl4  68832532  68832743  4 Thymus_M_GSM1328751 reg  chr8 reg  +   Rpl4  68832532   Rpl4  68832743
3  3   Dntt 267744370 267746423  3 Thymus_M_GSM1328751 reg  chr1 reg  +   Dntt 267744370   Dntt 267746423
4  2 Sptbn1 114201107 114200202  2 Thymus_M_GSM1328751 reg chr14 reg  - Sptbn1 114201107 Sptbn1 114200202
5  2 Ndufb7  35680273  35683909  2 Thymus_M_GSM1328751 reg chr19 reg  + Ndufb7  35680273 Ndufb7  35683909
> 

可以使用以下命令以任何順序重新排列列:

# Select columns 5 down to 1, i.e. the first five columns in reverse order.
ddf[, 5:1]

您可以使用 stringr::str_split_fixed,它允許您指定結果的長度。在這個例子中,我指定返回 10 列。對結果調用 data.frame 之後,不需要的列就更容易刪除。

# Sample input: one raw line of the data file from the fifth column onwards.
> row <- 'chr20|C4b:6382078|C4b:6381944|reg|-     2       Thymus_M_GSM1328751     reg'
# Split on runs of whitespace, ":" or "|" and return exactly n = 10 pieces as
# a one-row character matrix (str_split_fixed comes from stringr, which is
# assumed to be attached already).
> str_split_fixed(row, "[[:space:]]+|[:]|[|]", n = 10)
#      [,1]    [,2]  [,3]      [,4]  [,5]      [,6]  [,7] [,8] [,9]                  [,10]
# [1,] "chr20" "C4b" "6382078" "C4b" "6381944" "reg" "-"  "2"  "Thymus_M_GSM1328751" "reg"

把 @MrFlick 的評論轉換為答案:我會考慮使用我的 cSplit 函數,該函數目前以 GitHub Gist 的形式提供。

使用該函數,您可以使用fixed = FALSE參數來使用多個分隔符,如下所示:

library(devtools)     ## Convenient for loading cSplit
# Download and source the GitHub Gist that defines cSplit().
source_gist(11380733)

# Split column V5 on either "|" or ":". fixed = FALSE makes the separator a
# regular expression, which is what allows two delimiters at once. In the
# printed result the pieces appear as new columns V5_1, V5_2, ... and the
# original V5 column is no longer present.
cSplit(ddf, "V5", "\\||:", fixed = FALSE)
# Loading required package: data.table
# data.table 1.9.2  For help type: help("data.table")
#    V1     V2        V3        V4 V6                  V7  V8  V5_1   V5_2
# 1:  2    C4b   6382078   6381944  2 Thymus_M_GSM1328751 reg chr20    C4b
# 2:  4   Rpl4  68832532  68832743  4 Thymus_M_GSM1328751 reg  chr8   Rpl4
# 3:  3   Dntt 267744370 267746423  3 Thymus_M_GSM1328751 reg  chr1   Dntt
# 4:  2 Sptbn1 114201107 114200202  2 Thymus_M_GSM1328751 reg chr14 Sptbn1
# 5:  2 Ndufb7  35680273  35683909  2 Thymus_M_GSM1328751 reg chr19 Ndufb7
#         V5_3   V5_4      V5_5 V5_6 V5_7
# 1:   6382078    C4b   6381944  reg    -
# 2:  68832532   Rpl4  68832743  reg    +
# 3: 267744370   Dntt 267746423  reg    +
# 4: 114201107 Sptbn1 114200202  reg    -
# 5:  35680273 Ndufb7  35683909  reg    +

此答案使用@rnso答案中的樣本數據。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM