簡體   English   中英

如何獲得R中兩個向量之間元素的第n個匹配?

[英]How to get the nth match of the elements between two vectors in R?

match返回其第一個和第二個參數之間的第一個匹配位置:

match(c("a","c"), c("a", "a", "b", "c", "c", "c")) # 1 4

指定除第一個之外的匹配的最佳方法是什么? 例如,我們希望第二場比賽為"a" ,第三場比賽為"c" (所以我們得到: 2 6 )。

更新:效率低下的解決方案會進行n次查找:

value_index_query <- data.frame(value = c("a", "c"), index = c(2, 3))
id <-  c("a", "a", "b", "c", "c", "c")
apply(value_index_query, 1, function(value_index) {
  value <- value_index[1]
  index <- as.integer(value_index[2])
  which(id == value)[index]
})

這也使用mapply通過哪個(。)[n]操作串聯運行兩列。

with(value_index_query,  
     mapply( function(target, nth) which(id==target)[nth], 
               target=value, nth=index) )
[1] 2 6

這是一個data.table解決方案,我們將id向量與映射表連接起來。 然后我們可以使用.EACHI進行分組,從每個組的.I獲取index

library(data.table)
## 'dti' would be your 'value_index_query' with the 'value' column renamed
dti <- data.table(id = c("a", "c"), index = c(2, 3))
## join it with 'id' and take 'index' by group
data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
# [1] 2 6

我們可以把它放到一個函數中:

viq <- function(id, value, index) {
    dti <- data.table(id = value, index = index)
    data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}

id <- c("a", "a", "b", "c", "c", "c")

viq(id, c("a", "c"), 2:3)
# [1] 2 6
viq(id, c("a", "c"), c(2, 4))
# [1]  2 NA
viq(id, c("a", "b", "c"), c(2, 1, 4))
# [1]  2  3 NA
viq(id, c("a", "b", "c"), c(2, 1, 3))
# [1] 2 3 6

grep一次一個。

vec <- c("a", "a", "b", "c", "c", "c")
aa <-grep("a", vec)[2] #2nd
cc <-grep("c", vec)[3] #3rd
c(aa,cc)
#[1] 2 6

這是一種dplyr方式

library(dplyr)

test = data_frame(value = c("a","c"), order = c(2, 3))
original = data_frame(value =  c("a", "a", "b", "c", "c", "c"))

original %>%
  mutate(ID = 1:n()) %>%
  right_join(test) %>%
  group_by(value) %>%
  slice(order %>% first)

那這個呢?:

mapply(function(x,y) x[[y]], x = sapply(v1, function(x) which(x == v2)), y = c(2,3))
a c 
2 6 

為了比較,一個(可能不是理想的,我還在學習)Rcpp解決方案與其他三個主要方法的一些時間安排。

library(Rcpp)
library(microbenchmark)
library(data.table)
library(dplyr)

foo_mapply <- function(value,id,index){
    mapply( function(target, nth, id) which(id==target)[nth], 
                            target=value, nth=index,MoreArgs = list(id = id))
}

foo_dt <- function(dti,id){
    data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}

foo_dplyr <- function(test,original){
    original %>%
        mutate(ID = 1:n()) %>%
        right_join(test,by = "value") %>%
        group_by(value) %>%
        slice(order %>% first)
}

cppFunction('IntegerVector nmatch(CharacterVector value,CharacterVector id,IntegerVector index){
                        int nvalue = value.size();
            int nid = id.size();
            int completed = 0;
            IntegerVector match_count(nvalue,0);
            IntegerVector out(nvalue,IntegerVector::get_na());

            for (int i = 0; i < nid; ++i){
              for (int j = 0; j < nvalue; ++j){
                if (value[j] == id[i]){
                  match_count[j] = match_count[j] + 1;
                  if (match_count[j] == index[j]){
                    out[j] = i + 1;
                    completed++;
                  }
                }
              }
              if (completed == nvalue){
                break;
              }
            }
            return out;
                        }')

時間結果如下:

> #One with all matches relatively early
> set.seed(123)
> value <- c("a","b", "c")
> index <- c(150,50,500)
> id <-  sample(letters[1:5],10000,replace = TRUE)
> dti <- data.table(id = value,index = index)
> test = data_frame(value = value, order = index)
> original = data_frame(value =  id)
> 
> microbenchmark(nmatch(value = value, id = id,index = index),
+                            foo_mapply(value = value,id = id,index = index),
+                            foo_dt(dti = dti,id = id),
+                            foo_dplyr(test = test,original = original))
Unit: microseconds
                                              expr      min        lq      mean    median        uq      max neval  cld
     nmatch(value = value, id = id, index = index)  118.326  121.9060  124.2930  122.8535  124.5040  167.713   100 a   
 foo_mapply(value = value, id = id, index = index)  863.281  873.1505  949.8326  878.8535  896.7795 2119.411   100  b  
                        foo_dt(dti = dti, id = id) 1860.678 1927.0990 2038.5965 1985.2720 2082.7900 3761.116   100   c 
       foo_dplyr(test = test, original = original) 2862.143 2943.7280 3175.9202 2986.2385 3121.7685 4502.976   100    d

> #One with a match that forces us nearer the end of the list
> set.seed(123)
> value <- c("a","b", "c")
> index <- c(150,50,2000)
> id <-  sample(letters[1:5],10000,replace = TRUE)
> dti <- data.table(id = value,index = index)
> test = data_frame(value = value, order = index)
> original = data_frame(value =  id)
> 
> microbenchmark(nmatch(value = value, id = id,index = index),
+                            foo_mapply(value = value,id = id,index = index),
+                            foo_dt(dti = dti,id = id),
+                            foo_dplyr(test = test,original = original))
Unit: microseconds
                                              expr      min        lq      mean    median        uq       max neval cld
     nmatch(value = value, id = id, index = index)  469.208  473.4735  481.0698  475.1040  487.7145   560.031   100 a  
 foo_mapply(value = value, id = id, index = index)  861.797  872.6845  949.6749  882.5335  903.1255  2091.864   100 a  
                        foo_dt(dti = dti, id = id) 1821.554 1924.5690 2022.2231 1977.5970 2082.6035  3300.399   100  b 
       foo_dplyr(test = test, original = original) 2875.626 2945.7560 3681.2624 2995.7900 3100.3235 53508.339   100   c

有了這個設置

set.seed(123)
id <-  sample(letters[1:5], 10000, replace = TRUE)
value <- c("a", "b", "c")
index <- c(150, 50, 500)

索引然后拆分id向量

index_by_id <- split(seq_along(id), id)

將值與id_by_value的條目id_by_value

value_idx <- match(value, names(index_by_id))

選擇每個匹配的第i個元素

mapply(`[`, index_by_id[value_idx], index)

並作為一個功能:

f1 <- function(id, value, index) {
    index_by_id <- split(seq_along(id), id)
    value_idx <- match(value, names(index_by_id))
    mapply(`[`, index_by_id[value_idx], index)
}

value很長但有幾個等級時,這會很快,例如,

f0 <- function(id, value, index)
    mapply(function(target, nth) which(id==target)[nth], value, index)

viq <- function(id, value, index) {
    dti <- data.table(id = value, index = index)
    data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}

> value <- rep(value, 100)
> identical(f0(id, value, index), f1(id, value, index))
[1] TRUE
> all.equal(f0(id, value, index), viq(id, value, index),
+           check.attributes=FALSE)
[1] TRUE
> microbenchmark(f0(id, value, index), f1(id, value, index),
+                viq(id, value, index))
Unit: milliseconds
                  expr       min        lq      mean    median        uq
  f0(id, value, index) 53.166878 54.909566 56.917717 55.336116 56.503741
  f1(id, value, index)  1.682265  1.716843  1.883576  1.755070  1.831189
 viq(id, value, index)  4.304148  4.381708  4.667590  4.656087  4.757184
       max neval
 99.621742   100
  3.291769   100
  6.590130   100

@ 42-答案的變體

mapply(
  function(value, index) which(value == id)[index], 
  value = value_index_query$value, 
  index = value_index_query$index
)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM