
Compute with data.table: how many pairwise non-NA values are there between the variables?

Let's say I have this data.table (actual data is 25061 x 5862):

require(data.table)
df
  # gene     P1     P2     P3     P4     P5
 # 1: gene1  0.111  0.319  0.151     NA -0.397
 # 2: gene10  1.627  2.252  1.462 -1.339 -0.644
 # 3: gene2 -1.766 -0.056 -0.369  1.910  0.981
 # 4: gene3 -1.346  1.283  0.322 -0.465  0.403
 # 5: gene4 -0.783     NA -0.005  1.761  0.066
 # 6: gene5  0.386 -0.309 -0.886 -0.072  0.161
 # 7: gene6  0.547 -0.144 -0.725 -0.133  1.059
 # 8: gene7  0.785 -1.827  0.986  1.555 -0.798
 # 9: gene8 -0.186     NA  0.401  0.900 -1.075
# 10: gene9 -0.177  1.497 -1.370 -1.628 -1.044

I'd like to know how, taking advantage of the data.table structure, I can efficiently compute, for each pair of genes, how many pairs of values contain no NA. For example, for the pair gene1, gene2, I'd like the result to be 4 (the P4 pair is excluded because gene1 is NA there).
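For a single pair of genes, the count can be checked by hand (base R; the gene1 and gene2 values are copied from the example rows above):

```r
# P1..P5 values for gene1 and gene2, taken from the example data
gene1 <- c(0.111, 0.319, 0.151, NA, -0.397)
gene2 <- c(-1.766, -0.056, -0.369, 1.910, 0.981)

# a column counts only when both values are present
sum(!is.na(gene1) & !is.na(gene2))  # 4: gene1 is NA for P4
```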

With base R, I do it this way:

calc_nonNA <- !is.na(df[, -1, with=F])
Effectifs <- calc_nonNA %*% t(calc_nonNA)
# or, as suggested by @DavidArenburg and @Khashaa, more efficiently:
Effectifs <- tcrossprod(calc_nonNA)
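The two forms return the same matrix; `tcrossprod(X)` computes `X %*% t(X)` in one call without materialising the transpose, which is where the saving comes from. A quick check on a toy logical matrix:

```r
# small non-NA indicator matrix: 2 genes x 3 samples
X <- !is.na(matrix(c(1, NA, 3, 4, 5, NA), nrow = 2))

# explicit product and tcrossprod agree entry by entry
all(X %*% t(X) == tcrossprod(X))  # TRUE
```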

But, with a large df, it takes hours...

My desired output, with the provided example is this:

       gene1 gene10 gene2 gene3 gene4 gene5 gene6 gene7 gene8 gene9
gene1      4      4     4     4     3     4     4     4     3     4
gene10     4      5     5     5     4     5     5     5     4     5
gene2      4      5     5     5     4     5     5     5     4     5
gene3      4      5     5     5     4     5     5     5     4     5
gene4      3      4     4     4     4     4     4     4     4     4
gene5      4      5     5     5     4     5     5     5     4     5
gene6      4      5     5     5     4     5     5     5     4     5
gene7      4      5     5     5     4     5     5     5     4     5
gene8      3      4     4     4     4     4     4     4     4     4
gene9      4      5     5     5     4     5     5     5     4     5

data

df <- structure(list(gene = c("gene1", "gene10", "gene2", "gene3", 
"gene4", "gene5", "gene6", "gene7", "gene8", "gene9"), P1 = c(0.111, 
1.627, -1.766, -1.346, -0.783, 0.386, 0.547, 0.785, -0.186, -0.177
), P2 = c(0.319, 2.252, -0.056, 1.283, NA, -0.309, -0.144, -1.827, 
NA, 1.497), P3 = c(0.151, 1.462, -0.369, 0.322, -0.005, -0.886, 
-0.725, 0.986, 0.401, -1.37), P4 = c(NA, -1.339, 1.91, -0.465, 
1.761, -0.072, -0.133, 1.555, 0.9, -1.628), P5 = c(-0.397, -0.644, 
0.981, 0.403, 0.066, 0.161, 1.059, -0.798, -1.075, -1.044)), .Names = c("gene", 
"P1", "P2", "P3", "P4", "P5"), class = c("data.table", "data.frame"
), row.names = c(NA, -10L))  # .internal.selfref pointer dropped so the call parses

Using dplyr: transform the data from wide to long, then join it to itself and summarise. I'm not sure whether it is more efficient than your solution; some benchmarking, anyone?

library(dplyr)
library(tidyr)

# reshaping from wide to long
x <- df %>% gather(key = P, value = value, -c(1)) %>% 
  mutate(value=(!is.na(value)))

# result
left_join(x,x,by="P") %>% 
  group_by(gene.x,gene.y) %>% 
  summarise(N=sum(value.x & value.y)) %>% 
  spread(gene.y,N)

EDIT: Shame, this dplyr solution fails for a bigger dataset (2600x600); it can't join to itself: "internal vecseq reached physical limit", about 2^31 rows...
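One hedged workaround for the self-join blow-up (my addition, not part of the original answer): fill the count matrix in row blocks with `tcrossprod(A, B)` (which computes `A %*% t(B)`), so only a block-by-gene slice is computed at a time and no join is ever materialised. The block size of 5 here is just for illustration; on real data something like 1000 rows per block would be a reasonable guess:

```r
# stand-in for the real data: 20 genes x 10 samples with some NAs
set.seed(1)
M <- matrix(rnorm(200), nrow = 20)
M[sample(length(M), 30)] <- NA

nonNA <- !is.na(M)
n <- nrow(nonNA)
block <- 5L  # illustrative block size
counts <- matrix(0, n, n)

for (i in seq(1L, n, by = block)) {
  rows <- i:min(i + block - 1L, n)
  # tcrossprod(A, B) == A %*% t(B): one block of rows against all genes
  counts[rows, ] <- tcrossprod(nonNA[rows, , drop = FALSE], nonNA)
}

all(counts == tcrossprod(nonNA))  # TRUE
```

Each block could also be written out to disk instead of kept in `counts` if even the full result matrix is too large to hold.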

By the way, here is a benchmark of t vs tcrossprod:

library(ggplot2)
library(microbenchmark)

op <- microbenchmark(
  BASE_t={
    calc_nonNA <- !is.na(df[, -1, with=F])
    calc_nonNA %*% t(calc_nonNA)
    },
  BASE_tcrossprod={
    calc_nonNA <- !is.na(df[, -1, with=F])
    tcrossprod(calc_nonNA)
  },
  times=10
  )

qplot(y=time, data=op, colour=expr) + scale_y_log10()

[benchmark plot: timings of BASE_t vs BASE_tcrossprod, log scale]

I tried this out with random data of 25061x5862 and it quickly chewed up 50 GB of RAM (including swap space), so it is way, way less memory-efficient than using tcrossprod. But if you have an obscene amount of memory, then maybe (but probably not) this could be faster.

df1 <- copy(df)  # this answer works on df1, a copy of the question's df
# generate cross columns for all matches
crossDT <- data.table(gene  = rep(df1[, unique(gene)], length(df1[, unique(gene)])),
                      gene2 = rep(df1[, unique(gene)], each = length(df1[, unique(gene)])))
#create datatable with row for each combo
df2<-merge(df1,crossDT,by="gene")
setkey(df2,gene2)
setkey(df1,gene)
#make datatable with a set of P columns for each gene
df3<-df1[df2]
#find middle column and then make name vectors
pivotcol<-match("i.gene",names(df3))
names1<-names(df3)[2:(pivotcol-1)]
names2<-names(df3)[(pivotcol+1):ncol(df3)]
names3<-paste0("new",names1)
#make third set of P columns where the new value is False if either of the previous sets of P columns is NA
df3[, (names3) := lapply(1:length(names1), function(x)
      !any(is.na(c(get(names1[x]), get(names2[x]))))),
    by = c("gene", "i.gene")]
#delete first sets of P columns
df3[,c(names1,names2):=NULL]
#sum up true columns
df3[,Sum:=rowSums(.SD),.SDcols=names3]
#delete set of P columns
df3[,(names3):=NULL]
#cast results to desired shape
dcast.data.table(df3,gene~i.gene,value.var='Sum')
