![](/img/trans.png)
[英]Is there an R function that returns the alignment score of aligned DNA sequences?
[英]How to improve the performance of an R function that returns coordinates for aligned sequences?
我有一个对齐坐标的数据表。 例如,从“start_A”到“end_A”的序列与“chromosome_B”上从“start_B”到“end_B”的序列对齐:
library(data.table)
set.seed(1)
seq2 <- Vectorize(seq.default, vectorize.args = c("from", "to"))
pos1 <- sort(sample(1:1000, 40))
start_A <- pos1[(1:20)*2 - 1]
end_A <- pos1[(1:20)*2]
start_B <- sample(1:1000, 20)
end_B <- start_B + (end_A - start_A)
alignment <- data.table(start_A = start_A,
end_A = end_A,
chromosome_B = c(2, 2, 4, 4, 1, 1, 1, 3, 3, 3, 2, 2, 4, 4, 1, 1, 1, 2, 2, 2),
start_B = start_B,
end_B = end_B)
和一个坐标向量。 例如:
pos <- sample(unlist(seq2(alignment$start_A, alignment$end_A)), 15)
我写了一个函数,给出两个输入,将返回每个对齐坐标的坐标和染色体。 例如:
alakazam <- function(alignment, pos){
colnames(alignment) <-
c("start_A","end_A","chromosome_B","start_B","end_B")
vec_list_A <- seq2(alignment$start_A, alignment$end_A)
vec_list_B <- seq2(alignment$start_B, alignment$end_B)
out_list <- list()
for (i in 1:length(pos)){
pos_A <- pos[i]
pos_B <- rep(NA, length(vec_list_A))
for (j in 1:length(vec_list_A)){
tmp1 <- vec_list_A[[j]]
tmp2 <- vec_list_B[[j]]
if (length(tmp1[which(tmp1 %in% pos_A)]) != 0){
pos_B[j] <- tmp2[which(tmp1 %in% pos_A)]
}
out <- data.table(pos_A = rep(pos_A, length(vec_list_A)),
chromosome_B = alignment$chromosome_B,
pos_B = pos_B)
out <- out[complete.cases(out),]
print(j)
}
out_list[[i]] <- out
print(i)
}
output <- do.call("rbind", out_list)
return(output)
}
output <- alakazam(alignment, pos)
然而,我需要应用它的数据非常大,而且函数太慢。 有没有人对如何提高性能有任何建议?
谢谢!
Minem 看起来很接近,但我认为应该保留原始的“start_A”坐标以获得正确的结果。
posDT <- data.table(order1 = seq_along(pos),
pos1 = pos,
pos2 = pos,
key = 'pos1')
alignment[,start_A1 := start_A]
v1 <- alignment[posDT, on = .(start_A <= pos2, end_A >= pos2)][,-c("start_A", "end_A")]
v1[, pos_B := pos1-start_A1 + start_B]
setorder(v1, order1)
out <- v1[, .(pos_A = pos1, chromosome_B, pos_B)]
out
似乎你一遍又一遍地问同样的问题( 如何计算序列坐标的向量和数据帧之间的匹配? )
在这里你犯了一个错误
out <- data.table(pos_A = rep(pos_A, length(vec_list_A)),
chromosome_B = alignment$chromosome_B,
pos_B = pos_B)
out <- out[complete.cases(out),]
在循环中太深一层。 大概应该是这样的:
for (i in 1:length(pos)){
pos_A <- pos[i]
pos_B <- rep(NA, length(vec_list_A))
for (j in 1:length(vec_list_A)){
tmp1 <- vec_list_A[[j]]
tmp2 <- vec_list_B[[j]]
if (length(tmp1[which(tmp1 %in% pos_A)]) != 0){
pos_B[j] <- tmp2[which(tmp1 %in% pos_A)]
}
}
out <- data.table(pos_A = rep(pos_A, length(vec_list_A)),
chromosome_B = alignment$chromosome_B,
pos_B = pos_B)
out <- out[complete.cases(out),]
out_list[[i]] <- out
print(i)
}
这应该更快,但我不确定这是否会给出您想要的结果。
使用 data.table 合并:
posDT <- data.table(order1 = seq_along(pos),
pos1 = pos,
pos2 = pos,
key = 'pos1')
v1 <- alignment[posDT, on = .(start_A <= pos2, end_A >= pos2)]
v1[, pos_B := start_B + (start_A - pos1)]
setorder(v1, order1)
out <- v1[, .(pos_A = pos1, chromosome_B, pos_B)]
out
结果有点不同:
pos_A chromosome_B pos_B
1: 433 1 343
2: 975 2 810
3: 749 4 375
4: 936 2 435
5: 81 2 404
6: 621 2 684
7: 58 2 404
8: 297 4 506
9: 551 3 121
10: 719 2 537
11: 202 4 532
12: 492 3 582
13: 712 2 537
14: 910 1 39
15: 909 1 39
matching_pos <- function(alignment, pos){
colnames(alignment) <- c("start_A", "end_A", "chromosome_B", "start_B", "end_B")
# convert pos to dt
posDT <- data.table(pos1 = pos, pos2 = pos, key = 'pos1')
# merge dt
v1 <- alignment[posDT, on = .(start_A <= pos2, end_A >= pos2)]
v1 <- v1[,c("chromosome_B", "start_B", "end_B", "pos1")]
v1 <- alignment[v1, on = c("chromosome_B", "start_B", "end_B")]
# subset by direction
v1_plus <- v1[end_B >= start_B]
v1_minus <- v1[start_B > end_B]
# calculate difference and subtract
v1_plus$diff <- v1_plus$pos1 - v1_plus$start_A
v1_plus$pos2 <- v1_plus$start_B + v1_plus$diff
v1_minus$diff <- v1_minus$pos1 - v1_minus$start_A
v1_minus$pos2 <- v1_minus$start_B - v1_minus$diff
v1 <- rbind(v1_plus, v1_minus)
out <- v1[,c("start_A","end_A", "pos1", "start_B","end_B", "pos2", "chromosome_B")]
colnames(out) <- c("start_A","end_A", "pos_A", "start_B","end_B", "pos_B", "chromosome_B")
# out <- v1[,c("pos1", "pos2", "chromosome_B")]
# colnames(out) <- c("pos_A", "pos_B", "chromosome_B")
out <- out[order(pos_A)]
return(out)
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.