简体   繁体   中英

How to get the nth match of the elements between two vectors in R?

match returns the position of first matches between its first and second arguments:

match(c("a","c"), c("a", "a", "b", "c", "c", "c")) # 1 4

What's the best way to specify matches other than the first? For example, that we want the 2nd match for "a" and the 3rd for "c" (so we'd get: 2 6 ).

Update: the inefficient solution does n lookups:

value_index_query <- data.frame(value = c("a", "c"), index = c(2, 3))
id <-  c("a", "a", "b", "c", "c", "c")
apply(value_index_query, 1, function(value_index) {
  value <- value_index[1]
  index <- as.integer(value_index[2])
  which(id == value)[index]
})

This also uses mapply to run the two columns in tandem through the which(.)[n] operation.

with(value_index_query,  
     mapply( function(target, nth) which(id==target)[nth], 
               target=value, nth=index) )
[1] 2 6

Here is a data.table solution where we join the id vector with a mapping table. Then we can use .EACHI for the grouping, grabbing the index from .I for each group.

library(data.table)
## 'dti' would be your 'value_index_query' with the 'value' column renamed
dti <- data.table(id = c("a", "c"), index = c(2, 3))
## join it with 'id' and take 'index' by group
data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
# [1] 2 6

We can put that into a function:

viq <- function(id, value, index) {
    dti <- data.table(id = value, index = index)
    data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}

id <- c("a", "a", "b", "c", "c", "c")

viq(id, c("a", "c"), 2:3)
# [1] 2 6
viq(id, c("a", "c"), c(2, 4))
# [1]  2 NA
viq(id, c("a", "b", "c"), c(2, 1, 4))
# [1]  2  3 NA
viq(id, c("a", "b", "c"), c(2, 1, 3))
# [1] 2 3 6

One at a time with grep .

vec <- c("a", "a", "b", "c", "c", "c")
aa <-grep("a", vec)[2] #2nd
cc <-grep("c", vec)[3] #3rd
c(aa,cc)
#[1] 2 6

Here is a dplyr way

library(dplyr)

test = data_frame(value = c("a","c"), order = c(2, 3))
original = data_frame(value =  c("a", "a", "b", "c", "c", "c"))

original %>%
  mutate(ID = 1:n()) %>%
  right_join(test) %>%
  group_by(value) %>%
  slice(order %>% first)

What about this?:

mapply(function(x,y) x[[y]], x = sapply(v1, function(x) which(x == v2)), y = c(2,3))
a c 
2 6 

For comparison, a (probably not ideal, I'm still learning) Rcpp solution with some timings with the other three major approaches.

library(Rcpp)
library(microbenchmark)
library(data.table)
library(dplyr)

foo_mapply <- function(value,id,index){
    mapply( function(target, nth, id) which(id==target)[nth], 
                            target=value, nth=index,MoreArgs = list(id = id))
}

foo_dt <- function(dti,id){
    data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}

foo_dplyr <- function(test,original){
    original %>%
        mutate(ID = 1:n()) %>%
        right_join(test,by = "value") %>%
        group_by(value) %>%
        slice(order %>% first)
}

cppFunction('IntegerVector nmatch(CharacterVector value,CharacterVector id,IntegerVector index){
                        int nvalue = value.size();
            int nid = id.size();
            int completed = 0;
            IntegerVector match_count(nvalue,0);
            IntegerVector out(nvalue,IntegerVector::get_na());

            for (int i = 0; i < nid; ++i){
              for (int j = 0; j < nvalue; ++j){
                if (value[j] == id[i]){
                  match_count[j] = match_count[j] + 1;
                  if (match_count[j] == index[j]){
                    out[j] = i + 1;
                    completed++;
                  }
                }
              }
              if (completed == nvalue){
                break;
              }
            }
            return out;
                        }')

And the timing results:

> #One with all matches relatively early
> set.seed(123)
> value <- c("a","b", "c")
> index <- c(150,50,500)
> id <-  sample(letters[1:5],10000,replace = TRUE)
> dti <- data.table(id = value,index = index)
> test = data_frame(value = value, order = index)
> original = data_frame(value =  id)
> 
> microbenchmark(nmatch(value = value, id = id,index = index),
+                            foo_mapply(value = value,id = id,index = index),
+                            foo_dt(dti = dti,id = id),
+                            foo_dplyr(test = test,original = original))
Unit: microseconds
                                              expr      min        lq      mean    median        uq      max neval  cld
     nmatch(value = value, id = id, index = index)  118.326  121.9060  124.2930  122.8535  124.5040  167.713   100 a   
 foo_mapply(value = value, id = id, index = index)  863.281  873.1505  949.8326  878.8535  896.7795 2119.411   100  b  
                        foo_dt(dti = dti, id = id) 1860.678 1927.0990 2038.5965 1985.2720 2082.7900 3761.116   100   c 
       foo_dplyr(test = test, original = original) 2862.143 2943.7280 3175.9202 2986.2385 3121.7685 4502.976   100    d

> #One with a match that forces us nearer the end of the list
> set.seed(123)
> value <- c("a","b", "c")
> index <- c(150,50,2000)
> id <-  sample(letters[1:5],10000,replace = TRUE)
> dti <- data.table(id = value,index = index)
> test = data_frame(value = value, order = index)
> original = data_frame(value =  id)
> 
> microbenchmark(nmatch(value = value, id = id,index = index),
+                            foo_mapply(value = value,id = id,index = index),
+                            foo_dt(dti = dti,id = id),
+                            foo_dplyr(test = test,original = original))
Unit: microseconds
                                              expr      min        lq      mean    median        uq       max neval cld
     nmatch(value = value, id = id, index = index)  469.208  473.4735  481.0698  475.1040  487.7145   560.031   100 a  
 foo_mapply(value = value, id = id, index = index)  861.797  872.6845  949.6749  882.5335  903.1255  2091.864   100 a  
                        foo_dt(dti = dti, id = id) 1821.554 1924.5690 2022.2231 1977.5970 2082.6035  3300.399   100  b 
       foo_dplyr(test = test, original = original) 2875.626 2945.7560 3681.2624 2995.7900 3100.3235 53508.339   100   c

With this set-up

set.seed(123)
id <-  sample(letters[1:5], 10000, replace = TRUE)
value <- c("a", "b", "c")
index <- c(150, 50, 500)

Index the and then split the id vector

index_by_id <- split(seq_along(id), id)

Match the values to their entries in id_by_value

value_idx <- match(value, names(index_by_id))

Select the ith element of each match

mapply(`[`, index_by_id[value_idx], index)

And as a function:

f1 <- function(id, value, index) {
    index_by_id <- split(seq_along(id), id)
    value_idx <- match(value, names(index_by_id))
    mapply(`[`, index_by_id[value_idx], index)
}

This will be fast when value is long but with a few levels, eg,

f0 <- function(id, value, index)
    mapply(function(target, nth) which(id==target)[nth], value, index)

viq <- function(id, value, index) {
    dti <- data.table(id = value, index = index)
    data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}

and

> value <- rep(value, 100)
> identical(f0(id, value, index), f1(id, value, index))
[1] TRUE
> all.equal(f0(id, value, index), viq(id, value, index),
+           check.attributes=FALSE)
[1] TRUE
> microbenchmark(f0(id, value, index), f1(id, value, index),
+                viq(id, value, index))
Unit: milliseconds
                  expr       min        lq      mean    median        uq
  f0(id, value, index) 53.166878 54.909566 56.917717 55.336116 56.503741
  f1(id, value, index)  1.682265  1.716843  1.883576  1.755070  1.831189
 viq(id, value, index)  4.304148  4.381708  4.667590  4.656087  4.757184
       max neval
 99.621742   100
  3.291769   100
  6.590130   100

A variation on @42- answer

mapply(
  function(value, index) which(value == id)[index], 
  value = value_index_query$value, 
  index = value_index_query$index
)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM