[英]Find rows with overlapping ranges
我有 3 個大數據框,如下所示:
library(tibble)

# Example peak tables, written row by row: each peak has a start (coord1)
# and an end (coord2) coordinate.
df1 <- tribble(
  ~peak,   ~coord1, ~coord2,
  "peak1",     100,     250,
  "peak2",     500,     700,
  "peak3",    1000,    1250
)

df2 <- tribble(
  ~peak,   ~coord1, ~coord2,
  "peak5",     120,     300,
  "peak6",     280,     400,
  "peak7",     900,    1850
)

df3 <- tribble(
  ~peak,    ~coord1, ~coord2,
  "peak8",      900,    2000,
  "peak9",     3000,    3400,
  "peak10",    5600,    5850
)
# Print each input table; the `#>` lines are the output shown for it.
# First table: peak1-peak3.
df1
#> # A tibble: 3 × 3
#> peak coord1 coord2
#> <chr> <dbl> <dbl>
#> 1 peak1 100 250
#> 2 peak2 500 700
#> 3 peak3 1000 1250
# Second table: peak5-peak7.
df2
#> # A tibble: 3 × 3
#> peak coord1 coord2
#> <chr> <dbl> <dbl>
#> 1 peak5 120 300
#> 2 peak6 280 400
#> 3 peak7 900 1850
# Third table: peak8-peak10.
df3
#> # A tibble: 3 × 3
#> peak coord1 coord2
#> <chr> <dbl> <dbl>
#> 1 peak8 900 2000
#> 2 peak9 3000 3400
#> 3 peak10 5600 5850
我是 R 的新手。我想根據坐標范圍 (coord1, coord2) 找出三類峰:每個數據幀獨有的峰(其范圍不與其他數據幀的任何峰重疊)、兩個數據幀之間范圍重疊的峰,以及所有數據幀之間范圍都重疊的峰。
我希望把這些結果分別輸出為數據框。 目前我不知道如何在 R(dplyr)中按「范圍是否重疊」來進行過濾,我缺少這樣一個命令。
「unique」是指其范圍不與其他數據幀中任何峰的范圍重疊的峰:
> unique
peak coord1 coord2
peak6 280 400
peak9 3000 3400
peak10 5600 5850
df1-df2 之間共有
>df1df2
peak coord1 coord2
peak1 100 250
peak5 120 300
peak3 1000 1250
peak7 900 1850
df1-df3 之間共有
peak coord1 coord2
peak3 1000 1250
peak8 900 2000
最後是 df1、df2、df3 三者之間共有的峰
老實說,我不明白你搜索的最終目標是什么。 無論如何,這里有一個解決方案,使用 ivs 包中的 iv_overlaps() 等函數來檢查區間是否重疊。
這不是一個優雅的解決方案,而且它沒有考慮同一數據幀內部互相重疊的區間。
# Packages: tidyverse for the data-frame verbs, ivs for interval vectors.
library(tidyverse)
library(ivs)

# Rebuild the three example peak tables; peak labels are generated from
# their numeric suffixes.
df1 <- tibble(
  peak   = paste0("peak", 1:3),
  coord1 = c(100, 500, 1000),
  coord2 = c(250, 700, 1250)
)

df2 <- tibble(
  peak   = paste0("peak", 5:7),
  coord1 = c(120, 280, 900),
  coord2 = c(300, 400, 1850)
)

df3 <- tibble(
  peak   = paste0("peak", 8:10),
  coord1 = c(900, 3000, 5600),
  coord2 = c(2000, 3400, 5850)
)
# Use iv() to create the intervals: each table gets a `range` column built
# from its coordinates. ivs intervals are half-open, i.e. [coord1, coord2).
# Without this step the `range` referenced below is not a column at all.
df1 <- df1 %>% mutate(range = iv(coord1, coord2))
df2 <- df2 %>% mutate(range = iv(coord1, coord2))
df3 <- df3 %>% mutate(range = iv(coord1, coord2))

# Use iv_overlaps() to flag, for every ordered pair of tables, the peaks of
# the first table whose interval overlaps any interval of the second one.
# `check` labels which pair produced the flag.
check_df1_df2 <- df1 %>%
  mutate(any_overlap = iv_overlaps(range, df2$range),
         check = "df1-df2")

check_df1_df3 <- df1 %>%
  mutate(any_overlap = iv_overlaps(range, df3$range),
         check = "df1-df3")

check_df2_df1 <- df2 %>%
  mutate(any_overlap = iv_overlaps(range, df1$range),
         check = "df2-df1")

check_df2_df3 <- df2 %>%
  mutate(any_overlap = iv_overlaps(range, df3$range),
         check = "df2-df3")

check_df3_df1 <- df3 %>%
  mutate(any_overlap = iv_overlaps(range, df1$range),
         check = "df3-df1")

check_df3_df2 <- df3 %>%
  mutate(any_overlap = iv_overlaps(range, df2$range),
         check = "df3-df2")
# Stack the six pairwise checks (df_check identifies the source table of each
# row) and count, per peak, how many checks reported an overlap.
final_conclusion <- list(
  check_df1_df2, check_df1_df3,
  check_df2_df1, check_df2_df3,
  check_df3_df1, check_df3_df2
) %>%
  bind_rows(.id = "df_check") %>%
  group_by(peak) %>%
  mutate(overlapping_sum = sum(any_overlap))

# Peaks flagged by at least one check, spread to one interval column per check.
overlapping <- pivot_wider(
  filter(final_conclusion, overlapping_sum > 0),
  id_cols = peak,
  names_from = check,
  values_from = range
)
> overlapping
# A tibble: 5 × 7
# Groups: peak [5]
peak `df1-df2` `df1-df3` `df2-df1` `df2-df3` `df3-df1` `df3-df2`
<chr> <iv<dbl>> <iv<dbl>> <iv<dbl>> <iv<dbl>> <iv<dbl>> <iv<dbl>>
1 peak1 [100, 250) [100, 250) [NA, NA) [NA, NA) [NA, NA) [NA, NA)
2 peak3 [1000, 1250) [1000, 1250) [NA, NA) [NA, NA) [NA, NA) [NA, NA)
3 peak5 [NA, NA) [NA, NA) [120, 300) [120, 300) [NA, NA) [NA, NA)
4 peak7 [NA, NA) [NA, NA) [900, 1850) [900, 1850) [NA, NA) [NA, NA)
5 peak8 [NA, NA) [NA, NA) [NA, NA) [NA, NA) [900, 2000) [900, 2000)
# Peaks that no pairwise check flagged, in the same wide layout
# (one interval column per check).
not_overlapping <- pivot_wider(
  filter(final_conclusion, overlapping_sum == 0),
  id_cols = peak,
  names_from = check,
  values_from = range
)
> not_overlapping
# A tibble: 4 × 7
# Groups: peak [4]
peak `df1-df2` `df1-df3` `df2-df1` `df2-df3` `df3-df1` `df3-df2`
<chr> <iv<dbl>> <iv<dbl>> <iv<dbl>> <iv<dbl>> <iv<dbl>> <iv<dbl>>
1 peak2 [500, 700) [500, 700) [NA, NA) [NA, NA) [NA, NA) [NA, NA)
2 peak6 [NA, NA) [NA, NA) [280, 400) [280, 400) [NA, NA) [NA, NA)
3 peak9 [NA, NA) [NA, NA) [NA, NA) [NA, NA) [3000, 3400) [3000, 3400)
4 peak10 [NA, NA) [NA, NA) [NA, NA) [NA, NA) [5600, 5850) [5600, 5850)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.