[英]How to get the nth match of the elements between two vectors in R?
match
返回其第一个和第二个参数之间的第一个匹配位置:
match(c("a","c"), c("a", "a", "b", "c", "c", "c")) # 1 4
指定除第一个之外的匹配的最佳方法是什么? 例如,我们希望第二场比赛为"a"
,第三场比赛为"c"
(所以我们得到: 2 6
)。
更新:效率低下的解决方案会进行n次查找:
value_index_query <- data.frame(value = c("a", "c"), index = c(2, 3))
id <- c("a", "a", "b", "c", "c", "c")
apply(value_index_query, 1, function(value_index) {
value <- value_index[1]
index <- as.integer(value_index[2])
which(id == value)[index]
})
这也使用mapply通过哪个(。)[n]操作串联运行两列。
with(value_index_query,
mapply( function(target, nth) which(id==target)[nth],
target=value, nth=index) )
[1] 2 6
这是一个data.table解决方案,我们将id
向量与映射表连接起来。 然后我们可以使用.EACHI
进行分组,从每个组的.I
获取index
。
library(data.table)
## 'dti' would be your 'value_index_query' with the 'value' column renamed
dti <- data.table(id = c("a", "c"), index = c(2, 3))
## join it with 'id' and take 'index' by group
data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
# [1] 2 6
我们可以把它放到一个函数中:
viq <- function(id, value, index) {
dti <- data.table(id = value, index = index)
data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}
id <- c("a", "a", "b", "c", "c", "c")
viq(id, c("a", "c"), 2:3)
# [1] 2 6
viq(id, c("a", "c"), c(2, 4))
# [1] 2 NA
viq(id, c("a", "b", "c"), c(2, 1, 4))
# [1] 2 3 NA
viq(id, c("a", "b", "c"), c(2, 1, 3))
# [1] 2 3 6
用grep
一次一个。
vec <- c("a", "a", "b", "c", "c", "c")
aa <-grep("a", vec)[2] #2nd
cc <-grep("c", vec)[3] #3rd
c(aa,cc)
#[1] 2 6
这是一种dplyr方式
library(dplyr)
test = data_frame(value = c("a","c"), order = c(2, 3))
original = data_frame(value = c("a", "a", "b", "c", "c", "c"))
original %>%
mutate(ID = 1:n()) %>%
right_join(test) %>%
group_by(value) %>%
slice(order %>% first)
那这个呢?:
mapply(function(x,y) x[[y]], x = sapply(v1, function(x) which(x == v2)), y = c(2,3))
a c
2 6
为了比较,一个(可能不是理想的,我还在学习)Rcpp解决方案与其他三个主要方法的一些时间安排。
library(Rcpp)
library(microbenchmark)
library(data.table)
library(dplyr)
foo_mapply <- function(value,id,index){
mapply( function(target, nth, id) which(id==target)[nth],
target=value, nth=index,MoreArgs = list(id = id))
}
foo_dt <- function(dti,id){
data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}
foo_dplyr <- function(test,original){
original %>%
mutate(ID = 1:n()) %>%
right_join(test,by = "value") %>%
group_by(value) %>%
slice(order %>% first)
}
cppFunction('IntegerVector nmatch(CharacterVector value,CharacterVector id,IntegerVector index){
int nvalue = value.size();
int nid = id.size();
int completed = 0;
IntegerVector match_count(nvalue,0);
IntegerVector out(nvalue,IntegerVector::get_na());
for (int i = 0; i < nid; ++i){
for (int j = 0; j < nvalue; ++j){
if (value[j] == id[i]){
match_count[j] = match_count[j] + 1;
if (match_count[j] == index[j]){
out[j] = i + 1;
completed++;
}
}
}
if (completed == nvalue){
break;
}
}
return out;
}')
时间结果如下:
> #One with all matches relatively early
> set.seed(123)
> value <- c("a","b", "c")
> index <- c(150,50,500)
> id <- sample(letters[1:5],10000,replace = TRUE)
> dti <- data.table(id = value,index = index)
> test = data_frame(value = value, order = index)
> original = data_frame(value = id)
>
> microbenchmark(nmatch(value = value, id = id,index = index),
+ foo_mapply(value = value,id = id,index = index),
+ foo_dt(dti = dti,id = id),
+ foo_dplyr(test = test,original = original))
Unit: microseconds
expr min lq mean median uq max neval cld
nmatch(value = value, id = id, index = index) 118.326 121.9060 124.2930 122.8535 124.5040 167.713 100 a
foo_mapply(value = value, id = id, index = index) 863.281 873.1505 949.8326 878.8535 896.7795 2119.411 100 b
foo_dt(dti = dti, id = id) 1860.678 1927.0990 2038.5965 1985.2720 2082.7900 3761.116 100 c
foo_dplyr(test = test, original = original) 2862.143 2943.7280 3175.9202 2986.2385 3121.7685 4502.976 100 d
> #One with a match that forces us nearer the end of the list
> set.seed(123)
> value <- c("a","b", "c")
> index <- c(150,50,2000)
> id <- sample(letters[1:5],10000,replace = TRUE)
> dti <- data.table(id = value,index = index)
> test = data_frame(value = value, order = index)
> original = data_frame(value = id)
>
> microbenchmark(nmatch(value = value, id = id,index = index),
+ foo_mapply(value = value,id = id,index = index),
+ foo_dt(dti = dti,id = id),
+ foo_dplyr(test = test,original = original))
Unit: microseconds
expr min lq mean median uq max neval cld
nmatch(value = value, id = id, index = index) 469.208 473.4735 481.0698 475.1040 487.7145 560.031 100 a
foo_mapply(value = value, id = id, index = index) 861.797 872.6845 949.6749 882.5335 903.1255 2091.864 100 a
foo_dt(dti = dti, id = id) 1821.554 1924.5690 2022.2231 1977.5970 2082.6035 3300.399 100 b
foo_dplyr(test = test, original = original) 2875.626 2945.7560 3681.2624 2995.7900 3100.3235 53508.339 100 c
有了这个设置
set.seed(123)
id <- sample(letters[1:5], 10000, replace = TRUE)
value <- c("a", "b", "c")
index <- c(150, 50, 500)
索引然后拆分id
向量
index_by_id <- split(seq_along(id), id)
将值与id_by_value
的条目id_by_value
value_idx <- match(value, names(index_by_id))
选择每个匹配的第i个元素
mapply(`[`, index_by_id[value_idx], index)
并作为一个功能:
f1 <- function(id, value, index) {
index_by_id <- split(seq_along(id), id)
value_idx <- match(value, names(index_by_id))
mapply(`[`, index_by_id[value_idx], index)
}
当value
很长但有几个等级时,这会很快,例如,
f0 <- function(id, value, index)
mapply(function(target, nth) which(id==target)[nth], value, index)
viq <- function(id, value, index) {
dti <- data.table(id = value, index = index)
data.table(id)[dti, .I[index], by = .EACHI, on = "id"]$V1
}
和
> value <- rep(value, 100)
> identical(f0(id, value, index), f1(id, value, index))
[1] TRUE
> all.equal(f0(id, value, index), viq(id, value, index),
+ check.attributes=FALSE)
[1] TRUE
> microbenchmark(f0(id, value, index), f1(id, value, index),
+ viq(id, value, index))
Unit: milliseconds
expr min lq mean median uq
f0(id, value, index) 53.166878 54.909566 56.917717 55.336116 56.503741
f1(id, value, index) 1.682265 1.716843 1.883576 1.755070 1.831189
viq(id, value, index) 4.304148 4.381708 4.667590 4.656087 4.757184
max neval
99.621742 100
3.291769 100
6.590130 100
@ 42-答案的变体
mapply(
function(value, index) which(value == id)[index],
value = value_index_query$value,
index = value_index_query$index
)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.