简体   繁体   中英

Count occurrence of nodes in vertex of open triangles using igraph in R

In a network of passes among basketball players I want to:

  1. Detect open triangles in the network
  2. Count the number of unique players in brokering position (A passes to B & C; B & C don't pass to each other; A is brokering)
  3. Count the number of times these player broker an open triangle

Following this question Extracting Open Triangles in R Igraph (Network Analysis) we can do the following:

library(igraph)

set.seed(1234)
G <- sample_gnm(10, 15)

G
IGRAPH 72f8e6a U--- 10 15 -- Erdos renyi (gnm) graph
+ attr: name (g/c), type (g/c), loops (g/l), m (g/n)
+ edges from 72f8e6a:
[1] 1-- 3 1-- 4 3-- 4 1-- 5 3-- 5 6-- 7 3-- 8 4-- 8 6-- 8 7-- 8 2-- 9 6-- 9 7-- 9 4--10 9--10
 
plot(G)

G

Find the open triangles:

openTriList <- unique(do.call(c, lapply(as_ids(V(G)), function(v) {
    do.call(c, lapply(as_ids(neighbors(G, v)), function(v1) {
    v2 <- as_ids(neighbors(G, v1))
    v2 <- v2[shortest.paths(G, v, v2) == 2]

    if(length(v2) != 0) {
        lapply(v2, function(vv2) { c(v, v1, vv2)[order(c(v, v1, vv2))] })
    } else { list() }
    }))
})))

The results are correct:

do.call(rbind, openTriList)
  [,1] [,2] [,3]
  [1,]    1    3    8
  [2,]    1    4    8
  [3,]    1    4   10
  [4,]    2    6    9
  [5,]    2    7    9
  [6,]    2    9   10
  [7,]    3    4   10
  [8,]    3    6    8
  [9,]    3    7    8
  [10,]    1    4    5
  [11,]    3    4    5
  [12,]    4    6    8
  [13,]    4    7    8
  [14,]    4    9   10
  [15,]    3    5    8
  [16,]    6    9   10
  [17,]    7    9   10
  [18,]    4    8   10
  [19,]    6    8    9
  [20,]    7    8    9

How do we find the players that are brokers?

  • Player 2 is in this list because it is part of an open triangle, but is not a broker. We ignore this player.

And how can we efficiently count the number of times these player broker an open triangle?

  • Player 9 is brokering 5 open triangles.

[The real data holds millions of passes and several thousands of players. So performance is an important aspect. Using combn results in extremely long computational times. Are there faster ways of doing this? Perhaps getting the adjacency graph to build a sparse matrix and converting it into a data.table object for joining by neighbours? See this link . ]

Update

If you want to speed up, below is an option using for loops + combn to define a user functions f , which ouput a list including both the openTriList and occurCnt ( thank @minem's feedback as well for performance improvement ):

f <- function(G) {
  dmat <- as_adj(G, sparse = FALSE)
  resLst <- c()
  for (broker in 1:nrow(dmat)) {
    k <- which(dmat[broker, ] == 1)
    if (length(k) > 1) {
      inds <- t(combn(k, 2))
      resLst[[broker]] <- subset(cbind(broker, inds), dmat[inds] == 0)
    }
  }
  resLst <- do.call(rbind, resLst)
  resCnt <- table(resLst[, "broker"])
  list(openTriLst = resLst, occurCnt = resCnt)
}

and you will see that they can achieve the desired output

> set.seed(1234)

> G <- sample_gnm(10, 15)

> f(G)
$openTriLst
      broker
 [1,]      1 4  5
 [2,]      3 1  8
 [3,]      3 4  5
 [4,]      3 5  8
 [5,]      4 1  8
 [6,]      4 1 10
 [7,]      4 3 10
 [8,]      4 8 10
 [9,]      6 8  9
[10,]      7 8  9
[11,]      8 3  6
[12,]      8 3  7
[13,]      8 4  6
[14,]      8 4  7
[15,]      9 2  6
[16,]      9 2  7
[17,]      9 2 10
[18,]      9 6 10
[19,]      9 7 10
[20,]     10 4  9

$occurCnt

 1  3  4  6  7  8  9 10
 1  3  4  1  1  4  5  1

and the speed is remarkably improved than my previous answer. You can also compare it with the answer by @minem .

> set.seed(1234)

> G1 <- sample_gnm(1000, 4000)

> system.time(f(G1))
   user  system elapsed 
   0.07    0.00    0.08

> G2 <- sample_gnm(10000, 40000)

> system.time(f(G2))
   user  system elapsed 
   2.46    0.16    2.62

Previous Answer

You can try the code below using combn + are_ajdacent , eg,

G <- sample_gnm(10, 15) %>%
  get.data.frame() %>%
  graph_from_data_frame(directed = FALSE)

openTriList <- do.call(
  rbind,
  sapply(
    names(V(G)),
    function(v) {
      nbs <- names(neighbors(G, v))
      if (length(nbs) > 1) {
        do.call(rbind, Filter(length, combn(nbs, 2, FUN = function(x) {
          if (!are_adjacent(G, x[1], x[2])) {
            sort(as.numeric(c(v, x)))
          }
        }, simplify = FALSE)))
      }
    }
  )
)

occurCount <- na.omit(
  sapply(names(V(G)), function(v) {
    nbs <- names(neighbors(G, v))
    ifelse(length(nbs) > 1,
      sum(!combn(nbs,
        2,
        FUN = function(x) are_adjacent(G, x[1], x[2])
      )),
      NA
    )
  })
)

and you will get named vector

> openTriList
      [,1] [,2] [,3]
 [1,]    1    4    5
 [2,]    1    3    8
 [3,]    3    4    5
 [4,]    3    5    8
 [5,]    6    8    9
 [6,]    1    4    8
 [7,]    1    4   10
 [8,]    3    4   10
 [9,]    4    8   10
[10,]    7    8    9
[11,]    2    6    9
[12,]    6    9   10
[13,]    2    7    9
[14,]    7    9   10
[15,]    2    9   10
[16,]    3    6    8
[17,]    3    7    8
[18,]    4    6    8
[19,]    4    7    8
[20,]    4    9   10

> occurCount
 1  3  6  4  7  9  5  8 10 
 1  3  1  4  1  5  0  4  1
attr(,"na.action")
2
6
attr(,"class")
[1] "omit"

I reworked openTriList calculation by Thomas:

require(data.table)

v2 <- function(a) {
  n <- names(V(a))
  d <- as.data.table(a %>% get.data.frame())
  d <- as.data.table(lapply(d, as.numeric))
  k1 <- d[[1]]
  k2 <- d[[2]]
  ks <- k1 + k2
  # v <- n[[1]] # for testing
  xx <- sapply(n, function(v) {
    nbs <- as.numeric(names(neighbors(a, v)))
    vn <- as.numeric(v)
    
    if (length(nbs) > 1) {
      i2 <- combn(nbs, 2, simplify = F)
      
      # reduce test vectors:
      ss <- sapply(i2, sum)
      ii <- ks %in% ss
      kk1 <- k1[ii]
      kk2 <- k2[ii]
      
      # reduce test vectors 2:
      i <- (kk1 %in% nbs) & (kk2 %in% nbs)
      kk1 <- kk1[i]
      kk2 <- kk2[i]
      
      # x <- i2[[1]] # for testing
      i3 <- lapply(i2, function(x) {
        q1 <- kk1 == x[1]
        q2 <- kk2 == x[2]
        zz <- q1 & q2
        r2 <- !any(zz)
        if (is.null(r2) || r2) {
          rs <- c(vn, x)
          rs <- .Internal(sort(rs, decreasing = F)) # less overhead
          rs
        }
      })
      s <- i3[lengths(i3) > 0]
      do.call(rbind, s)
      }
    }
  )
  do.call(rbind, xx)
}

openTriList <- v2(G)

This should be significantly faster. combn isnt slow. Slow is are_adjacent . Thomas code is written quite complicated, its hard to debug and find possible slowdowns... Test:

set.seed(1)
G <- sample_gnm(1000, 4000)
G <- get.data.frame(G) %>% graph_from_data_frame(directed = FALSE)
system.time(r1 <- v1(G)) # 12.00 sec
system.time(r2 <- v2(G)) # 0.99 sec
all.equal(r1, r2) # TRUE

Update

fun_openTriList2 <- function(G) {
  dmat <- as_adj(G, sparse = FALSE)
  res <- list()
  for (i in 1:nrow(dmat)) {
    inds <- which(dmat[i, ] == 1)
    if (length(inds) > 1) {
      r <- combn(inds, 2,
                 FUN = function(x) {
                   if (dmat[x[1], x[2]] == 0) {
                     .Internal(sort(c(i, x), decreasing = F))
                   }
                 },
                 simplify = F
      )
      res[[i]] <- do.call(rbind, r)
    }
  }
  res <- do.call(rbind, res)
  res
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM