[英]Speeding up matrix row and column operations in R
我有一個積極的大矩陣:
set.seed(1)
mat <- matrix(abs(rnorm(130000*1000)),nrow=130000,ncol=1000)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
該rownames
的mat
與一個關聯parent.id
:
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(13000,130000,replace=T)),sep=""))
這樣每隔幾行就與同一個parent.id
相關聯。
我需要為mat
每一row
計算這些操作:
行元素的對log
mean
在具有相同parent.id
的所有行中該行的mean
比例
具有相同parent.id
的所有行中該行所占比例的mean
概率
具有相同parent.id
的所有行中該行所占比例的sd
概率
自然,這是想到的第一個解決方案:
require(VGAM)
res.df <- do.call(rbind,mclapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(log(mat[x,])),
mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
}))
但是我想知道是否有任何方法可以更快地實現這一目標。
聚苯乙烯
我不認為使用data.table
而不是matrix是data.table
的方法:
require(data.table)
require(microbenchmark)
require(VGAM)
set.seed(1)
mat <- data.table(matrix(abs(rnorm(13*5)),nrow=13,ncol=5))
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(as.numeric(log(mat[x,]))),
mean.proportion=mean(as.numeric(mat[x,])/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))))
})))
expr
df <- do.call(rbind, lapply(1:nrow(mat), function(x) { idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) data.frame(mean.log = mean(as.numeric(log(mat[x, ]))), mean.proportion = mean(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum)))) }))
min lq mean median uq max neval
65.08929 66.49415 69.78937 67.70534 70.38044 206.017 100
>
相比:
set.seed(1)
mat <- matrix(abs(rnorm(13*5)),nrow=13,ncol=5)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
require(VGAM)
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(log(mat[x,])),
mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
})))
Unit: milliseconds
expr
df <- do.call(rbind, lapply(1:nrow(mat), function(x) { idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) data.frame(mean.log = mean(log(mat[x, ])), mean.proportion = mean(mat[x, ]/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(mat[x, ]/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(mat[x, ]/apply(mat[idx, ], 2, sum)))) }))
min lq mean median uq max neval
10.15047 10.2894 10.69573 10.428 10.69741 14.56724 100
除非申請as.numeric
我想要運行的操作,每次data.table
行是一個壞主意。
我不認為使用data.table代替matrix是可行的方法
顯然,您必須實際使用data.table。 這不是在無需花費任何精力的情況下優化代碼的魔杖。 您需要使用data.table語法。
我需要為mat中的每一行計算這些操作:
mean of log of the row elements mean proportion of of that row out of all rows with the same parent.id mean probit of the proportion of of that row out of all rows with the same parent.id sd probit of the proportion of of that row out of all rows with the same parent.id
我認為這可能會滿足您的需求:
library(data.table)
DT <- data.table(row.ids.df, mat)
DT <- melt(DT, id.vars = c("row.id", "parent.id"))
DT[, proportion := value / sum(value), by = .(variable, parent.id)]
res <- DT[, .(
mean.log = mean(log(value)),
mean.proportion = mean(proportion),
mean.probit = mean(probit(proportion)),
sd.probit = sd(probit(proportion))), by = row.id]
all.equal(res[["sd.probit"]],
res.df[["sd.probit"]])
#[1] TRUE
#(Tested with 100 rows and 30 columns.)
我希望它會更高效,但是絕對可讀。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.