在 R 中按重叠范围连接两个数据框

Question

注意：此问题已作为“重复”关闭。 这里和这里提供的解决方案没有回答我的问题。 他们展示了当单个条目落在一个范围内时如何合并，我试图识别重叠范围并加入它们。 也许我的标题本来可以更好...

我有一个带有开始和结束时间（以秒为单位）的主数据集main_df 。 我想看看main_df中的时间范围是否在lookup_df的范围列表中，如果是，请从lookup_df中获取值。 此外，如果main_df在两个不同的查找范围内，请复制该行以便表示每个值。***

# Time ranges (start / end, in seconds) that need a value looked up.
main_df <- tibble(
  start = c(30, 124, 161),
  end = c(80, 152, 185)
)

# Reference ranges; `value` is what gets attached to an overlapping main range.
lookup_df <- tibble(
  start = c(34, 73, 126, 141, 174, 221),
  end = c(69, 123, 136, 157, 189, 267),
  value = c("a", "b", "b", "b", "b", "a")
)

# Do something here to get the following:

> final_df
# A tibble: 4 x 4
  start   end value notes                                      
  <dbl> <dbl> <chr> <chr>                                      
1    30    80 a     ""                                         
2    30    80 b     "Duplicate because it falls within a and b"
3   124   152 b     "Falls within two lookups but both are b"  
4   161   185 b     ""

***编辑：看看我构建问题的方式......

#Not actual code
left_join(main_df, lookup_df, by(some_range_join_function) %>% 
  add_rows(through_some_means)

而不是必须添加一个新行，我可以翻转我如何加入他们......

semi_join(lookup_df, main_df, by(some_range_join_function))

Answer 1

您可以先做一些逻辑比较，然后分别处理匹配值全部为 'b'、同时包含 'a' 和 'b' 等情况。通过这种方式，您可以轻松添加更多您在问题中没有说明的情况，例如全部为 'a'，或者一个 'a' 加多个 'b'。如果没有任何匹配项，该方法会返回 NULL，而 NULL 会在 rbind 时被忽略。

# Find every interval of `y` that overlaps the interval `x` and report the
# matched values.
#
# x: named numeric vector, x[1] = interval start, x[2] = interval end (one row
#    of main_df as handed over by apply()).
# y: lookup table (data.frame or tibble); columns 1, 2 and 3 hold the interval
#    start, the interval end and the value.
#
# Returns a data.frame holding the columns of `x` plus `value` and `notes`,
# or NULL when nothing overlaps (rbind() ignores NULL entries).
#   * all matches share one value -> a single row, noted "both <value>"
#   * matches with different values -> one row per match; the last row is
#     noted with the distinct values joined by " & " (e.g. "a & b")
f <- \(x, y) {
  # `[[` returns a plain vector for data.frames and tibbles alike.
  y_start <- y[[1]]
  y_end <- y[[2]]

  w <- which(
    (x[1] >= y_start & x[1] <= y_end) | (x[2] >= y_start & x[1] <= y_end)
  )
  if (length(w) == 0) {
    return(NULL)
  }

  d <- data.frame(t(x), value = y[[3]][w], notes = "")
  if (length(w) >= 2) {
    vals <- unique(d$value)
    if (length(vals) == 1) {
      # Several lookups, one value: collapse to a single row.
      d <- d[1, , drop = FALSE]
      d$notes[1] <- paste("both", vals)
    } else {
      d$notes[nrow(d)] <- paste(sort(vals), collapse = " & ")
    }
  }
  d
}

# Run f() on every row of main_df; simplify = FALSE keeps the per-row results
# as a list, which rbind() then stacks into a single data.frame.
apply(main_df, 1, f, lookup_df, simplify = FALSE) |> do.call(what = rbind)
#   start end value  notes
# 1    30  80     a       
# 2    30  80     b  a & b
# 3   124 152     b both b
# 4   161 185     b

数据：

# Main intervals; the first row (2, 1) overlaps none of the lookup intervals.
main_df <- data.frame(
  start = c(2, 30, 124, 161),
  end = c(1, 80, 152, 185)
)

# Lookup intervals with the value attached to each of them.
lookup_df <- data.frame(
  start = c(34, 73, 126, 141, 174, 221),
  end = c(69, 123, 136, 157, 189, 267),
  value = c("a", "b", "b", "b", "b", "a")
)

Answer 2

另一种选择是fuzzyjoin::interval_join ：

library(fuzzyjoin)
library(dplyr)

# Inner interval join of main_df against lookup_df on the (start, end) columns.
# Afterwards keep the first row of every (value, main interval) group, so each
# value appears once per main interval, and rename the main interval columns
# back to start / end. The result stays grouped.
main_df %>%
  interval_join(lookup_df, by = c("start", "end"), mode = "inner") %>%
  group_by(value, start.x, end.x) %>%
  slice(1) %>%
  select(start = start.x, end = end.x, value)

# A tibble: 4 × 3
# Groups:   value, start, end [4]
  start   end value
  <dbl> <dbl> <chr>
1    30    80 a    
2    30    80 b    
3   124   152 b    
4   161   185 b

Answer 3

为此，您可以使用 data.table 中的 foverlaps。

library(data.table)

# Convert both tables to data.table (skip if they already are).
setDT(main_df)
setDT(lookup_df)

# The table passed as `y` to foverlaps() must be keyed on its interval columns.
setkey(main_df, start, end)

# Overlap join of the lookup intervals (x) against the keyed main intervals (y);
# nomatch = NULL drops lookup rows that overlap nothing.
foverlaps(x = lookup_df, y = main_df, type = "any", nomatch = NULL)

#    start end i.start i.end value
# 1:    30  80      34    69     a
# 2:    30  80      73   123     b
# 3:   124 152     126   136     b
# 4:   124 152     141   157     b
# 5:   161 185     174   189     b

或将清理后的结果作为最终结果（OP 的 final_df）

# Same overlap join, reduced to the main interval plus the matched value and
# then de-duplicated.
foverlaps(lookup_df, main_df, nomatch = NULL)[, .(start, end, value)] |>
  unique()

   start end value
1:    30  80     a
2:    30  80     b
3:   124 152     b
4:   161 185     b

Answer 4

基于powerjoin的可能解决方案：

library(tidyverse)
library(powerjoin)

# Left join on a custom overlap predicate (.x = main_df row, .y = lookup_df
# row). distinct() collapses main intervals that were matched by several
# lookups carrying the same value.
power_left_join(
  main_df,
  lookup_df,
  by = ~ (.x$start >= .y$start & .x$start <= .y$end) |
    (.x$start <= .y$start & .x$end >= .y$end) |
    (.x$start <= .y$start & .x$end >= .y$start),
  keep = "left"
) %>%
  distinct()

#> # A tibble: 4 x 3
#>   start   end value
#>   <dbl> <dbl> <chr>
#> 1    30    80 a    
#> 2    30    80 b    
#> 3   124   152 b    
#> 4   161   185 b

或使用tidyr::crossing ：

library(tidyverse)

# Build every main_df x lookup_df pairing (lookup bounds renamed to start2 /
# end2), keep the pairs whose intervals overlap, drop the lookup bounds and
# de-duplicate.
crossing(
  main_df, lookup_df,
  .name_repair = ~ c("start", "end", "start2", "end2", "value")
) %>%
  filter(
    (start >= start2 & start <= end2) |
      (start <= start2 & end >= end2) |
      (start <= start2 & end >= start2)
  ) %>%
  select(-start2, -end2) %>%
  distinct()

#> # A tibble: 4 x 3
#>   start   end value
#>   <dbl> <dbl> <chr>
#> 1    30    80 a    
#> 2    30    80 b    
#> 3   124   152 b    
#> 4   161   185 b

Answer 5

您可以使用fuzzyjoin包通过fuzzyjoin::interval_*_join()函数基于间隔进行连接。

我将使用内连接，因为如果像您建议的那样使用半连接，您将丢失 value 列，并且只得到 3 行。

library(tidyverse)
library(fuzzyjoin)

# Inner join of the lookup intervals with the main intervals on (start, end),
# matching on any overlap (type = "any").
lookup_df |>
  fuzzyjoin::interval_inner_join(main_df, by = c("start", "end"), type = "any")
#> # A tibble: 5 × 5
#>   start.x end.x value start.y end.y
#>     <dbl> <dbl> <chr>   <dbl> <dbl>
#> 1      34    69 a          30    80
#> 2      73   123 b          30    80
#> 3     126   136 b         124   152
#> 4     141   157 b         124   152
#> 5     174   189 b         161   185

如您所见，interval_inner_join() 保留了两个表中的 by 列，因为在模糊连接中它们的值并不相同。此外，对于 main_df 中与 lookup_df 的多行相匹配的情况，结果中仍然各占一行。因此，我们需要对连接结果做一些清理：

lookup_df |>
  interval_inner_join(main_df, by = c("start", "end"), type = "any") |>
  # drop the lookup interval columns (suffix .x)
  select(-ends_with(".x")) |>
  # collapse rows that are now identical
  distinct() |>
  # strip the .y suffix from the remaining interval columns
  rename_with(\(nm) str_remove(nm, "\\.y"), ends_with(".y"))
#> # A tibble: 4 × 3
#>   value start   end
#>   <chr> <dbl> <dbl>
#> 1 a        30    80
#> 2 b        30    80
#> 3 b       124   152
#> 4 b       161   185

最后，澄清一下术语：在您的问题中，您说希望在 main_df 的区间落在 lookup_df 的区间之内时进行连接。这可以通过在 interval_*_join() 中使用 type = "within" 来实现。但是根据您提供的示例，您似乎希望只要存在任何重叠就进行连接。这可以使用 type = "any" 来完成，而它是默认值，因此无需显式指定。

加入 r 中两个数据帧的重叠范围

问题描述

5 个解决方案

解决方案1
1 2022-05-11 06:57:25

解决方案2
0 2022-05-11 07:42:59

解决方案3
0 2022-05-11 07:54:34

解决方案4
0 2022-05-11 08:20:12

解决方案5
0 2022-05-11 09:26:56

加入 r 中两个数据帧的重叠范围

问题描述

5 个解决方案

解决方案1 1 2022-05-11 06:57:25

解决方案2 0 2022-05-11 07:42:59

解决方案3 0 2022-05-11 07:54:34

解决方案4 0 2022-05-11 08:20:12

解决方案5 0 2022-05-11 09:26:56

解决方案1
1 2022-05-11 06:57:25

解决方案2
0 2022-05-11 07:42:59

解决方案3
0 2022-05-11 07:54:34

解决方案4
0 2022-05-11 08:20:12

解决方案5
0 2022-05-11 09:26:56