[英]Speeding up matrix row and column operations in R
我有一个积极的大矩阵:
set.seed(1)
mat <- matrix(abs(rnorm(130000*1000)),nrow=130000,ncol=1000)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
该rownames
的mat
与一个关联parent.id
:
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(13000,130000,replace=T)),sep=""))
这样每隔几行就与同一个parent.id
相关联。
我需要为mat
每一row
计算这些操作:
行元素的对log
mean
在具有相同parent.id
的所有行中该行的mean
比例
具有相同parent.id
的所有行中该行所占比例的mean
概率
具有相同parent.id
的所有行中该行所占比例的sd
概率
自然,这是想到的第一个解决方案:
require(VGAM)
res.df <- do.call(rbind,mclapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(log(mat[x,])),
mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
}))
但是我想知道是否有任何方法可以更快地实现这一目标。
聚苯乙烯
我不认为使用data.table
而不是matrix是data.table
的方法:
require(data.table)
require(microbenchmark)
require(VGAM)
set.seed(1)
mat <- data.table(matrix(abs(rnorm(13*5)),nrow=13,ncol=5))
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(as.numeric(log(mat[x,]))),
mean.proportion=mean(as.numeric(mat[x,])/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))))
})))
expr
df <- do.call(rbind, lapply(1:nrow(mat), function(x) { idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) data.frame(mean.log = mean(as.numeric(log(mat[x, ]))), mean.proportion = mean(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum)))) }))
min lq mean median uq max neval
65.08929 66.49415 69.78937 67.70534 70.38044 206.017 100
>
相比:
set.seed(1)
mat <- matrix(abs(rnorm(13*5)),nrow=13,ncol=5)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
require(VGAM)
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(log(mat[x,])),
mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
})))
Unit: milliseconds
expr
df <- do.call(rbind, lapply(1:nrow(mat), function(x) { idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) data.frame(mean.log = mean(log(mat[x, ])), mean.proportion = mean(mat[x, ]/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(mat[x, ]/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(mat[x, ]/apply(mat[idx, ], 2, sum)))) }))
min lq mean median uq max neval
10.15047 10.2894 10.69573 10.428 10.69741 14.56724 100
除非申请as.numeric
我想要运行的操作,每次data.table
行是一个坏主意。
我不认为使用data.table代替matrix是可行的方法
显然,您必须实际使用data.table。 这不是在无需花费任何精力的情况下优化代码的魔杖。 您需要使用data.table语法。
我需要为mat中的每一行计算这些操作:
mean of log of the row elements mean proportion of of that row out of all rows with the same parent.id mean probit of the proportion of of that row out of all rows with the same parent.id sd probit of the proportion of of that row out of all rows with the same parent.id
我认为这可能会满足您的需求:
library(data.table)
DT <- data.table(row.ids.df, mat)
DT <- melt(DT, id.vars = c("row.id", "parent.id"))
DT[, proportion := value / sum(value), by = .(variable, parent.id)]
res <- DT[, .(
mean.log = mean(log(value)),
mean.proportion = mean(proportion),
mean.probit = mean(probit(proportion)),
sd.probit = sd(probit(proportion))), by = row.id]
all.equal(res[["sd.probit"]],
res.df[["sd.probit"]])
#[1] TRUE
#(Tested with 100 rows and 30 columns.)
我希望它会更高效,但是绝对可读。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.