[英]Filtering rows with group by and matching different strings in multiple columns
我有一個包含多個變量的數據框。 我的目標是在按其他變量分組的前提下,過濾掉 match1 和 match2 不符合條件的行。 例如,在下面的數據框中,我想按 ID、name、所有 var 變量(var1 至 var5)、time 和 time_unit 進行分組。 我另外新增了一個 n 列,以便更清楚地顯示預期輸出與原始數據之間的差異。
按這些列分組後,可以看到 match1 和 match2 列存在不正確的重複項,即 n = 7、10、11、13 的行:這些行的 match1 和 match2 不相同,而其他列都相同。 即使其中一個 match 列為 NA,我也想保留該行。 注意:對於 n = 9 的行,我希望 Group1 也能與 Group3 匹配。
我嘗試了不同的方法,但在網上找不到類似的東西。 我如何匹配但也創建一種關於允許匹配的字典/列表? 也就是說,如果 match1 包含 group1 並且 match2 包含 group2 但也包含 group3,我想保留行,反之亦然。 我的真實數據框包含更多行,並且我更改了數據。
dataframe:
ID name var1 var2 var3 var4 var5 match1 time time_unit match2 n
1: 1 name1 trt1 Flexible 10.0 40 mg Group1 6 Weeks <NA> 1
2: 1 name1 trt2 Flexible NA NA <NA> Group1 6 Weeks Group1 2
3: 1 name1 trt3 Flexible 12.5 50 mg Group1 8 Weeks <NA> 3
4: 1 name1 trt1 Flexible 10.0 40 mg Group1 8 Weeks <NA> 4
5: 1 name1 trt2 Flexible NA NA <NA> Group1 8 Weeks Group1 5
6: 2 name2 trt4 Fixed 10.0 10 mg Group1 0 weeks Group1 6
7: 2 name2 trt4 Fixed 10.0 10 mg Group1 0 weeks Group2 7
8: 2 name2 trt5 Fixed 20.0 20 mg Group1 0 weeks Group1 8
9: 2 name2 trt5 Fixed 20.0 20 mg Group1 0 weeks Group3 9
10: 2 name2 trt5 Fixed 20.0 20 mg Group1 0 weeks Group2 10
11: 2 name2 trt4 Fixed 10.0 10 mg Group2 0 weeks Group1 11
12: 2 name2 trt4 Fixed 10.0 10 mg Group2 0 weeks Group2 12
13: 2 name2 trt5 Fixed 20.0 20 mg Group2 0 weeks Group1 13
14: 2 name2 trt5 Fixed 20.0 20 mg Group2 0 weeks Group2 14
15: 3 name3 trt6 Flexible 10.0 40 mg Group1 0 weeks Group1 15
16: 3 name3 trt2 Flexible NA NA <NA> Group1 0 weeks Group1 16
17: 3 name3 trt6 Flexible 10.0 40 mg Group1 8 Weeks Group1 17
18: 3 name3 trt2 Flexible NA NA <NA> Group1 8 weeks Group1 18
預期輸出:
ID name var1 var2 var3 var4 var5 match1 time time_unit match2 n
1: 1 name1 trt1 Flexible 10.0 40 mg Group1 6 Weeks <NA> 1
2: 1 name1 trt2 Flexible NA NA <NA> Group1 6 Weeks Group1 2
3: 1 name1 trt3 Flexible 12.5 50 mg Group1 8 Weeks <NA> 3
4: 1 name1 trt1 Flexible 10.0 40 mg Group1 8 Weeks <NA> 4
5: 1 name1 trt2 Flexible NA NA <NA> Group1 8 Weeks Group1 5
6: 2 name2 trt4 Fixed 10.0 10 mg Group1 0 weeks Group1 6
7: 2 name2 trt5 Fixed 20.0 20 mg Group1 0 weeks Group1 8
8: 2 name2 trt5 Fixed 20.0 20 mg Group1 0 weeks Group3 9
9: 2 name2 trt4 Fixed 10.0 10 mg Group2 0 weeks Group2 12
10: 2 name2 trt5 Fixed 20.0 20 mg Group2 0 weeks Group2 14
11: 3 name3 trt6 Flexible 10.0 40 mg Group1 0 weeks Group1 15
12: 3 name3 trt2 Flexible NA NA <NA> Group1 0 weeks Group1 16
13: 3 name3 trt6 Flexible 10.0 40 mg Group1 8 weeks Group1 17
14: 3 name3 trt2 Flexible NA NA <NA> Group1 8 weeks Group1 18
這是數據的 dput() 輸出:
# dput() of the example data.table (19 rows).
# dput() on a data.table also emits `.internal.selfref = <pointer: ...>`,
# which is not valid R syntax; that attribute is omitted here so the
# expression can be pasted and evaluated directly.
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), name = c("name1", "name1",
"name1", "name1", "name1", "name2", "name2", "name2", "name2",
"name2", "name2", "name2", "name2", "name2", "name3", "name3",
"name3", "name3", "name3"), var1 = c("trt1", "trt2", "trt3",
"trt1", "trt2", "trt4", "trt4", "trt5", "trt5", "trt5", "trt4",
"trt4", "trt5", "trt5", "trt6", "trt2", "trt6", "trt2", "trt6"
), var2 = c("Flexible", "Flexible", "Flexible", "Flexible", "Flexible",
"Fixed", "Fixed", "Fixed", "Fixed", "Fixed", "Fixed", "Fixed",
"Fixed", "Fixed", "Flexible", "Flexible", "Flexible", "Flexible",
"Flexible"), var3 = c(10, NA, 12.5, 10, NA, 10, 10, 20, 20, 20,
10, 10, 20, 20, 10, NA, 10, NA, 10), var4 = c(40L, NA, 50L, 40L,
NA, 10L, 10L, 20L, 20L, 20L, 10L, 10L, 20L, 20L, 40L, NA, 40L,
NA, 40L), var5 = c("mg", NA, "mg", "mg", NA, "mg", "mg", "mg",
"mg", "mg", "mg", "mg", "mg", "mg", "mg", NA, "mg", NA, "mg"),
match1 = c("Group1", "Group1", "Group1", "Group1", "Group1",
"Group1", "Group1", "Group1", "Group1", "Group1", "Group2",
"Group2", "Group2", "Group2", "Group1", "Group1", "Group1",
"Group1", "Group1"), time = c(6L, 6L, 8L, 8L, 8L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 8L, 8L), time_unit = c("Weeks",
"Weeks", "Weeks", "Weeks", "Weeks", "weeks", "weeks", "weeks",
"weeks", "weeks", "weeks", "weeks", "weeks", "weeks", "weeks",
"weeks", "weeks", "weeks", "Weeks"), match2 = c(NA, "Group1",
NA, NA, "Group1", "Group1", "Group2", "Group1", "Group3",
"Group2", "Group1", "Group2", "Group1", "Group2", "Group1",
"Group1", "Group1", "Group1", "Group1"), n = 1:19), row.names = c(NA,
-19L), class = c("data.table", "data.frame"))
我嘗試了不同的方法,但在網上找不到類似的東西。 我如何匹配但也創建一種關於允許匹配的字典/列表?
我可能會使用 case_when,讓這兩列可以按任意指定的取值組合進行匹配。 不過,我不確定分組在這裡有什麼影響,所以不確定自己是否完全理解你想要做什麼。 也許這能幫助你開始:
library(tidyverse)
# Example data: 19 rows, with `n` acting as a row identifier.
# Constant runs are spelled with rep() so the block layout
# (5 rows for ID 1, 9 rows for ID 2, 5 rows for ID 3) is visible at a glance.
df <- tibble(
  ID = rep(1:3, times = c(5L, 9L, 5L)),
  name = rep(c("name1", "name2", "name3"), times = c(5L, 9L, 5L)),
  var1 = c(
    "trt1", "trt2", "trt3", "trt1", "trt2",
    "trt4", "trt4", "trt5", "trt5", "trt5", "trt4", "trt4", "trt5", "trt5",
    "trt6", "trt2", "trt6", "trt2", "trt6"
  ),
  var2 = rep(c("Flexible", "Fixed", "Flexible"), times = c(5L, 9L, 5L)),
  var3 = c(
    10, NA, 12.5, 10, NA,
    10, 10, 20, 20, 20, 10, 10, 20, 20,
    10, NA, 10, NA, 10
  ),
  var4 = c(
    40L, NA, 50L, 40L, NA,
    10L, 10L, 20L, 20L, 20L, 10L, 10L, 20L, 20L,
    40L, NA, 40L, NA, 40L
  ),
  var5 = c(
    "mg", NA, "mg", "mg", NA,
    rep("mg", 9L),
    "mg", NA, "mg", NA, "mg"
  ),
  match1 = rep(c("Group1", "Group2", "Group1"), times = c(10L, 4L, 5L)),
  time = rep(c(6L, 8L, 0L, 8L), times = c(2L, 3L, 13L, 1L)),
  # Note the mixed capitalisation ("Weeks" vs "weeks") is part of the data.
  time_unit = rep(c("Weeks", "weeks", "Weeks"), times = c(5L, 13L, 1L)),
  match2 = c(
    NA, "Group1", NA, NA, "Group1",
    "Group1", "Group2", "Group1", "Group3", "Group2",
    "Group1", "Group2", "Group1", "Group2",
    "Group1", "Group1", "Group1", "Group1", "Group1"
  ),
  n = 1:19
)
# Flag every row whose (match1, match2) pair is acceptable:
#   custom_filter == 1 -> keep, custom_filter == 0 -> reject.
# The result stays grouped by ID, name, var1..var5, time and time_unit;
# the flag itself is computed element-wise from match1/match2 only.
df %>%
  group_by(across(c(ID, name, starts_with("var"), time, time_unit))) %>%
  mutate(
    custom_filter = case_when(
      # A missing value on either side counts as a match.
      is.na(match1) | is.na(match2) ~ 1,
      # Identical groups match.
      match1 == match2 ~ 1,
      # Explicitly allowed cross-group pair; add further pairs here.
      match1 == "Group1" & match2 == "Group3" ~ 1,
      # Everything else is a mismatch.
      TRUE ~ 0
    )
  )
#> # A tibble: 19 x 13
#> # Groups: ID, name, var1, var2, var3, var4, var5, time, time_unit [10]
#> ID name var1 var2 var3 var4 var5 match1 time time_~1 match2 n
#> <int> <chr> <chr> <chr> <dbl> <int> <chr> <chr> <int> <chr> <chr> <int>
#> 1 1 name1 trt1 Flexib~ 10 40 mg Group1 6 Weeks <NA> 1
#> 2 1 name1 trt2 Flexib~ NA NA <NA> Group1 6 Weeks Group1 2
#> 3 1 name1 trt3 Flexib~ 12.5 50 mg Group1 8 Weeks <NA> 3
#> 4 1 name1 trt1 Flexib~ 10 40 mg Group1 8 Weeks <NA> 4
#> 5 1 name1 trt2 Flexib~ NA NA <NA> Group1 8 Weeks Group1 5
#> 6 2 name2 trt4 Fixed 10 10 mg Group1 0 weeks Group1 6
#> 7 2 name2 trt4 Fixed 10 10 mg Group1 0 weeks Group2 7
#> 8 2 name2 trt5 Fixed 20 20 mg Group1 0 weeks Group1 8
#> 9 2 name2 trt5 Fixed 20 20 mg Group1 0 weeks Group3 9
#> 10 2 name2 trt5 Fixed 20 20 mg Group1 0 weeks Group2 10
#> 11 2 name2 trt4 Fixed 10 10 mg Group2 0 weeks Group1 11
#> 12 2 name2 trt4 Fixed 10 10 mg Group2 0 weeks Group2 12
#> 13 2 name2 trt5 Fixed 20 20 mg Group2 0 weeks Group1 13
#> 14 2 name2 trt5 Fixed 20 20 mg Group2 0 weeks Group2 14
#> 15 3 name3 trt6 Flexib~ 10 40 mg Group1 0 weeks Group1 15
#> 16 3 name3 trt2 Flexib~ NA NA <NA> Group1 0 weeks Group1 16
#> 17 3 name3 trt6 Flexib~ 10 40 mg Group1 0 weeks Group1 17
#> 18 3 name3 trt2 Flexib~ NA NA <NA> Group1 0 weeks Group1 18
#> 19 3 name3 trt6 Flexib~ 10 40 mg Group1 8 Weeks Group1 19
#> # ... with 1 more variable: `case_when(...)` <dbl>, and abbreviated variable
#> # name 1: time_unit
創建於 2022-11-23,使用reprex v2.0.2
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.