使用dplyr在大型數據框上應用自定義計數功能或應用的有效方法

Question

我有一個相當大的數據框（536 x 46000），我想計算每列次要元素的頻率。 示例數據段在此處：

require(dplyr)

m1 <- c(0:2,NA,0:2,NA)
m2 <- c(NA, NA, 0:2,NA, 0, 2)
m3 <- c(0,1,1,1,2,0,NA,2)
g1 <- seq(1:20)
dat <- as.data.frame(cbind(g1,m1,m2,m3))
dat$g1 <- as.factor(dat$g1)
dat

 g1  m1  m2  m3
 1   0   NA  0
 2   1   NA  1
 3   2   0   1
 .   .   .   .
 .   .   .   .

我編寫了一個函數來吐出次要元素的頻率。 該函數僅計算每個元素的出現，並將次要元素除以除NA以外的所有元素的總和：

 maf.fun <- function(x) { 
    m0 <- length(which(x == 0)) 
    m1 <- length(which(x == 1))
    m2 <- length(which(x == 2))
    MAF <- min(m0,m1,m2)/sum(m0,m1,m2)
    MAF 
    }

然后使用summarize_each從dplyr包以獲得由所述次要元件的頻率：

 MAF <- summarise_each(dat[,-1], funs(maf.fun))

它輸出：

  m1          m2        m3
  0.3333333   0.2       0.2857143

現在，當數據集很小時，這一切都是快速而又不錯的，但是，隨着龐大的數據幀（數千列），代碼的運行速度變得非常緩慢。

有什么有效的方法可以有效地進行這種計算嗎？

Answer 1

這是一個選擇：

fnc = function(x) min(table(x))/sum(!is.na(x))

dat %>% 
  summarise_each(funs(fnc), -g1)

但是，事實證明，這比示例數據上的maf.fun慢很多。 另一方面，如果數據可以使用許多唯一值（而不是僅0、1和2），並且可能需要大量輸入才能處理具有多個唯一值的列，則maf.fun不靈活。 因此，這里有一個函數可以處理具有任意數量唯一值的向量。 此函數的運行速度幾乎與maf.fun一樣快，但無論唯一值的數量如何，該函數都將起作用：

maf.fun2 = function(vec) {
  min(sapply(na.omit(unique(vec)), function(j) sum(vec==j, na.rm=TRUE)))/sum(!is.na(vec))
}

dat %>% summarise_each(funs(maf.fun2), -g1)

基數R等效為：

sapply(dat[, -1], maf.fun2)

這是較大數據幀上的一些時序。 需要注意的是基地sapply快於dplyr summarise_each ，並具有較大的數據幀，有沒有太大的區別fnc ， maf.fun和maf.fun2 ：

536行x 1,000列; 3種可能的列值，再加上NA ：

set.seed(10)
dat = data.frame(g1=1:536, replicate(1000, sample(c(0:2,NA), 536, replace=TRUE)))

 Unit: seconds expr min lq mean median uq max neval cld dplyr_maf.fun 0.48 0.49 0.50 0.50 0.52 0.53 5 b dplyr_fnc 0.80 0.82 0.84 0.84 0.86 0.86 5 d dplyr_maf.fun2 0.56 0.57 0.59 0.60 0.60 0.62 5 c sapply_maf.fun2 0.10 0.10 0.11 0.11 0.11 0.12 5 a

536行x 1,000列; 100個可能的列值，再加上NA ：

set.seed(10)
dat = data.frame(g1=1:536, replicate(1000, sample(c(1:100,NA), 536, replace=TRUE)))

 Unit: seconds expr min lq mean median uq max neval cld dplyr_fnc 0.90 0.91 0.92 0.92 0.92 0.93 5 b dplyr_maf.fun2 0.99 1.02 1.05 1.03 1.09 1.11 5 c sapply_maf.fun2 0.52 0.54 0.56 0.56 0.57 0.62 5 a

Answer 2

作為我的評論的替代方法，這里嘗試使用table加快計算速度：

maf.fun <- function(x) { 
  myTable <- table(x)
  myTable <- myTable[names(myTable) %in% c("0", "1", "2")]

  min(myTable) / sum(myTable)
}

Answer 3

主要因素是用於每列的函數，而不是自定義函數本身。 以下是一些基准：

library(dplyr)
library(data.table)
library(microbenchmark)
dat1 <- as.data.table(dat)
cols <- colnames(dat1)[2:length(dat1)]


# wheatSingh
maf.fun1 <- function(x) { 
    m0 <- length(which(x == 0)) 
    m1 <- length(which(x == 1))
    m2 <- length(which(x == 2))
    MAF <- min(m0,m1,m2)/sum(m0,m1,m2)
    MAF 
}


# lmo1
maf.fun2 <- function(x) { 
    m0 <- sum(x == 0, na.rm = T) 
    m1 <- sum(x == 1, na.rm = T)
    m2 <- sum(x == 2, na.rm = T)
    MAF <- min(m0,m1,m2)/sum(m0,m1,m2)
    MAF 
}


# lmo2
maf.fun3 <- function(x) { 
    myTable <- table(x)
    myTable <- myTable[names(myTable) %in% c("0", "1", "2")]

    min(myTable) / sum(myTable)
}


# sumedh
maf.fun4 <- function(x) {
    x1 <- tabulate(x + 1)
    x1 <- x1[x1!=0]
    x2 <- min(x1)/sum(x1)
    return(x2)
}


# eipi10 1
maf.fun5 <- function(x) {
    min(table(x))/sum(!is.na(x))
}


# eipi10 2
maf.fun6 <- function(vec) {
    min(sapply(na.omit(unique(vec)), function(j) sum(vec==j, na.rm=TRUE)))/sum(!is.na(vec))

}


# summarise each
wheatSingh_each  <- function(x) summarise_each(x, funs(maf.fun1), -g1)
lmo1_each        <- function(x) summarise_each(x, funs(maf.fun2), -g1)
lmo2_each        <- function(x) summarise_each(x, funs(maf.fun3), -g1)
sumedh_each      <- function(x) summarise_each(x, funs(maf.fun4), -g1)
eipi10_each      <- function(x) summarise_each(x, funs(maf.fun5), -g1)
eipi10_each2     <- function(x) summarise_each(x, funs(maf.fun6), -g1)

microbenchmark(wheatSingh_each(dat), lmo1_each(dat), lmo2_each(dat),
               sumedh_each(dat), eipi10_each(dat), eipi10_each2(dat), unit = "ms")

Unit: milliseconds
                 expr      min       lq     mean   median       uq      max neval
 wheatSingh_each(dat) 1.260625 1.292623 1.385346 1.332168 1.414579 3.071865   100
       lmo1_each(dat) 1.258813 1.288095 1.387961 1.362054 1.430579 2.224808   100
       lmo2_each(dat) 1.782865 1.826939 1.962498 1.909652 2.009874 2.580416   100
     sumedh_each(dat) 1.270888 1.298057 1.431485 1.353300 1.416994 3.170276   100
     eipi10_each(dat) 1.700756 1.752377 1.896515 1.845957 1.921728 3.758326   100
    eipi10_each2(dat) 1.425448 1.482200 1.606445 1.555556 1.628910 3.496904   100

# sapply
wheatSingh_sapply <- function(x) sapply(x, maf.fun1)
lmo1_sapply       <- function(x) sapply(x, maf.fun2)
lmo2_sapply       <- function(x) sapply(x, maf.fun3)
sumedh_sapply     <- function(x) sapply(x, maf.fun4)
eipi10_sapply     <- function(x) sapply(x[, names(dat) != "g1"], maf.fun5)
eipi10_sapply2    <- function(x) sapply(x[, names(dat) != "g1"], maf.fun6)


microbenchmark(wheatSingh_sapply(dat[,-1]), lmo1_sapply(dat[,-1]), 
               lmo2_sapply(dat[,-1]), sumedh_sapply(dat[,-1]),
               eipi10_sapply(dat), eipi10_sapply2(dat), unit = "ms")

Unit: milliseconds
                         expr      min        lq       mean    median        uq      max neval
 wheatSingh_sapply(dat[, -1]) 0.061583 0.0664130 0.07586755 0.0760730 0.0802995 0.114712   100
       lmo1_sapply(dat[, -1]) 0.054942 0.0597720 0.06603859 0.0639975 0.0700350 0.095393   100
       lmo2_sapply(dat[, -1]) 0.482394 0.5062425 0.52361843 0.5216380 0.5337130 0.607370   100
     sumedh_sapply(dat[, -1]) 0.063395 0.0694320 0.07805922 0.0754695 0.0812045 0.118336   100
           eipi10_sapply(dat) 0.420812 0.4431510 0.46422176 0.4603580 0.4787725 0.636954   100
          eipi10_sapply2(dat) 0.171464 0.1880680 0.20320984 0.1965200 0.2155380 0.329646   100



# data.table
wheatSingh_dt       <- function(x) t(x[,.(ans=lapply(.SD, maf.fun1)),.SDcols=cols])
lmo1_dt             <- function(x) t(x[,.(ans=lapply(.SD, maf.fun2)),.SDcols=cols])
lmo2_dt             <- function(x) t(x[,.(ans=lapply(.SD, maf.fun3)),.SDcols=cols])
sumedh_dt           <- function(x) t(x[,.(ans=lapply(.SD, maf.fun4)),.SDcols=cols])
eipi10_dt           <- function(x) t(x[,.(ans=lapply(.SD, maf.fun5)),.SDcols=cols])
eipi10_dt2          <- function(x) t(x[,.(ans=lapply(.SD, maf.fun6)),.SDcols=cols])



microbenchmark(wheatSingh_dt(dat1), lmo1_dt(dat1), lmo2_dt(dat1), sumedh_dt(dat1), 
               eipi10_dt(dat1), eipi10_dt2(dat1), unit = "ms") 



Unit: milliseconds
                expr      min        lq      mean    median        uq      max neval
 wheatSingh_dt(dat1) 0.737780 0.7700795 0.8260051 0.8050970 0.8467555 1.307717   100
       lmo1_dt(dat1) 0.717856 0.7773255 0.8248158 0.8093235 0.8401145 1.397071   100
       lmo2_dt(dat1) 1.232248 1.2971515 1.3635452 1.3454505 1.4046180 2.021950   100
     sumedh_dt(dat1) 0.737176 0.7743060 0.8260775 0.8096255 0.8527940 1.364469   100
     eipi10_dt(dat1) 1.159195 1.2156455 1.3718648 1.2548890 1.3385075 4.757527   100
    eipi10_dt2(dat1) 0.869397 0.9095455 0.9691232 0.9512035 1.0082580 1.246738   100

注意：僅當列中的值為整數時，我的解決方案（使用tabulate maf.fun4 ）才能工作

@ eipi10提供的較大數據框上的結果：

set.seed(10)
dat = data.frame(g1=1:536, replicate(1000, sample(c(1:100,NA), 536, replace=TRUE)))

sumedh_sapply     <- function(x) sapply(x, maf.fun4)
eipi10_sapply2     <- function(x) sapply(x, maf.fun6)

identical(sumedh_sapply(dat[,-1]), eipi10_sapply2(dat[,-1]))
[1] TRUE

microbenchmark(sumedh_sapply(dat[,-1]), eipi10_sapply2(dat[,-1]), unit = "s")

Unit: seconds
                      expr        min         lq       mean     median         uq        max neval cld
  sumedh_sapply(dat[, -1]) 0.01308923 0.01393871 0.01615033 0.01714913 0.01766564 0.02056302   100  a 
 eipi10_sapply2(dat[, -1]) 0.40788421 0.42277774 0.44252427 0.42845991 0.43098025 0.56735431   100   b

使用dplyr在大型數據框上應用自定義計數功能或應用的有效方法

問題描述

3 個解決方案

解決方案1
2 2016-07-16 16:48:51

解決方案2
1 2016-07-16 02:55:27

解決方案3
1 2016-07-16 03:31:54

使用dplyr在大型數據框上應用自定義計數功能或應用的有效方法

問題描述

3 個解決方案

解決方案1 2 2016-07-16 16:48:51

解決方案2 1 2016-07-16 02:55:27

解決方案3 1 2016-07-16 03:31:54

解決方案1
2 2016-07-16 16:48:51

解決方案2
1 2016-07-16 02:55:27

解決方案3
1 2016-07-16 03:31:54