[英]How to set to 0 all values that appear fewer than k times in variables within nested df
library(tidyverse)
# Example data (dput output): a 3-row tibble, one row per group
# ("Group A", "Group B", "Group C"), with three list-columns:
#   data         - per group, a 25 x 4 tibble with columns a, b, c, d
#   h_candidates - per group, a named numeric vector of length 6
#                  (names "0%", "0.01%", ..., "0.05%")
#   assignment   - per group, a 25 x 5 tibble with columns
#                  "0%", "0.01%", ..., "0.04%"
ex <- structure(list(group = c("Group A", "Group B", "Group C"), data = list(
structure(list(a = c(25.1, 15.1, 28.7, 29.7, 5.3, 3.4, 5.3,
10.1, 2.4, 18, 4.7, 22.1, 9.5, 3.1, 26.5, 5.1, 24, 22.5,
19.4, 22.9, 24.5, 18.2, 7.9, 5.3, 24.7), b = c(95.1, 51,
100, 94.1, 47.3, 0, 50.7, 45.8, 40.7, 49.4, 51.9, 76.4, 26.7,
19.8, 37.4, 59.4, 59.1, 60.2, 26.1, 2.8, 100, 40.7, 56.4,
42.5, 0), c = c(39.9, 42.7, 16.3, 11.1, 56.9, 17.8, 62, 28.1,
43, 44.8, 54.8, 8.7, 5.5, 40.2, 7.7, 60.7, 24.8, 7.5, 3.5,
16.9, 31.6, 45.8, 76.7, 58.6, 15.8), d = c(-2.39999999999999,
28.6, -4.59999999999999, -1.39999999999999, 10.3, 3.1, 23.4,
-43, -36.3, 32.4, 33.1, 9.8, 1.5, -17.6, 16.6, 20.9, 7.8,
-1.7, -23.3, 0, -15, 59.3, -40.2, 46.9, 4.7)), .Names = c("a",
"b", "c", "d"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(a = c(5, 4.7, 30.3,
14.3, 31.6, 6, 4.9, 23.3, 26.9, 16.9, 27.2, 23.8, 19.9, 28.6,
9.9, 17.4, 14.3, 12.5, 30.4, 30.3, 30, 6, 18, 23.7, 5.1),
b = c(48.9, 41.3, 20.1, 63.7, 85.1, 30.3, 52.8, 49.7,
27.1, 51.6, 21.8, 52.4, 52.5, 59.6, 13.7, 53.1, 69, 66.9,
23.4, 35.4, 45.8, 23.7, 62.9, 90.3, 59.6), c = c(37.4,
18.5, 64.6, 13.5, 7.8, 6.8, 12.7, 8.5, 7.8, 5.4, 14.1,
20.5, 10.9, 10.5, 7.5, 14.7, 6.9, 0.699999999999999,
4.7, 1.9, 11.9, 0.9, 7.2, 9.2, 42.2), d = c(4.9, -3.7,
13.5, 21.9, -2.69999999999999, 6.6, 0.5, -12.3, 38.7,
-25.8, -18, 28.4, 38.3, -3.6, 39.4, 19, 23.4, -38.7,
17, 36.3, -31.7, -9.3, -10.5, 9.7, -10.6)), .Names = c("a",
"b", "c", "d"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(a = c(29.9, 12.8, 23.9,
26.2, 27.5, 32.6, 33.2, 24.8, 29, 22.6, 4.7, 25.6, 4.7, 13.1,
25.9, 14.5, 23.5, 26.6, 12.8, 24.1, 9.1, 31.9, 24.8, 4.6,
17.9), b = c(63.7, 23.3, 71.2, 46.7, 30.6, 49.3, 14.6, 68.4,
27.9, 49.1, 60.5, 26.4, 56.9, 55.4, 37.9, 40.7, 32.7, 68.5,
42.7, 27.9, 67.5, 43.4, 76.6, 53.3, 26.8), c = c(1.6, 32,
18.6, 14, 0.5, 7.2, 27.3, 8.9, 11, 15.5, 16.7, 16.4, 63.1,
14.7, 6.8, 9, 3.1, 11.7, 11, 11.5, 10.6, 14.9, 7.1, 13.2,
5.1), d = c(-35.4, 21, 12, 1.8, 37.6, 9.2, 17.6, 0, -19.4,
32.6, -32, -3.6, 7.2, -25.7, 9.1, -8, 35.8, 24.8, -13.9,
-21.7, -28.7, 0.200000000000003, -16.9, -26.5, 26.2)), .Names = c("a",
"b", "c", "d"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame"))), h_candidates = list(structure(c(0.17320508075689, 2.37782856461527, 2.94890646051978, 3.35205778704499, 3.66771041547043, 3.95224618679369), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%", "0.05%")), structure(c(0.316227766016836, 2.63452963884554, 3.2327619513522, 3.63593179253957, 3.97743636027027, 4.22137418384109), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%", "0.05%")), structure(c(0.316227766016837, 2.7258026340878, 3.24807635378234, 3.62353418639869, 3.92683078321437, 4.17731971484109), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%", "0.05%"))), assignment = list(
structure(list(`0%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25),
`0.01%` = c(1, 2, 3, 3, 4, 5, 4, 6, 7, 8, 9, 10, 11,
12, 13, 4, 14, 15, 16, 17, 18, 19, 20, 21, 17), `0.02%` = c(1,
2, 3, 3, 4, 5, 4, 6, 7, 8, 9, 10, 11, 12, 13, 4, 14,
15, 16, 17, 18, 19, 20, 21, 17), `0.03%` = c(1, 2, 3,
3, 4, 5, 4, 6, 7, 8, 9, 10, 11, 12, 13, 4, 10, 14, 15,
16, 17, 18, 19, 9, 16), `0.04%` = c(1, 2, 3, 4, 5, 6,
5, 7, 8, 9, 10, 11, 12, 13, 14, 5, 11, 15, 16, 17, 18,
19, 20, 10, 17)), .Names = c("0%", "0.01%", "0.02%",
"0.03%", "0.04%"), row.names = c(NA, -25L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(`0%` = c(1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25), `0.01%` = c(1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 4, 17, 18, 19, 20, 21, 22,
23, 24), `0.02%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 13, 4, 16, 17, 9, 18, 19, 14, 20, 21), `0.03%` = c(1,
2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 11, 12, 13, 14, 12, 4, 15,
6, 8, 16, 17, 13, 18, 19), `0.04%` = c(1, 2, 3, 4, 5, 6,
2, 7, 8, 9, 10, 11, 12, 13, 14, 12, 4, 15, 6, 8, 7, 16, 13,
17, 1)), .Names = c("0%", "0.01%", "0.02%", "0.03%", "0.04%"
), row.names = c(NA, -25L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(`0%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
), `0.01%` = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 12, 15, 16, 17, 15, 18, 19, 4, 20, 21, 22), `0.02%` = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 5, 10, 11, 12, 13, 11, 14, 5, 15,
14, 16, 17, 18, 8, 19, 20), `0.03%` = c(1, 2, 3, 4, 5, 6,
7, 3, 8, 9, 10, 11, 12, 10, 11, 13, 5, 14, 13, 8, 10, 4,
3, 13, 6), `0.04%` = c(1, 2, 3, 4, 5, 5, 6, 3, 7, 8, 9, 10,
11, 9, 10, 12, 5, 13, 12, 7, 9, 4, 3, 12, 5)), .Names = c("0%",
"0.01%", "0.02%", "0.03%", "0.04%"), row.names = c(NA, -25L
), class = c("tbl_df", "tbl", "data.frame")))), .Names = c("group", "data", "h_candidates", "assignment"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L))
使用上述結構的數據,我想更改 assignment 中各個 data.frame 的列裡出現次數少於 k 次(假設 k = 5)的所有值。
所以,我需要一個解決方案:依次處理每個 data.frame,再依次處理該 data.frame 中的每一列,檢查哪些值在該列中出現少於 5 次,如果有,就把它們替換為 0。
解決方案最好使用 tidyverse 函數。我認為這里需要嵌套的 purrr::map 和 dplyr::mutate,但是不知道如何計算列中各個值的出現次數,然後再替換這些值。
您可以使用 purrr::map() 遍歷帶有數據幀的列表列,然后使用 purrr::modify() 遍歷每個數據幀中的每一列。然后,只需定義一個函數,對向量中每個值的出現次數進行計數,並在計數小於 k 時將它們替換為 0:
library(tidyverse)
# For every tibble in the `assignment` list-column, walk over its columns and
# set to 0 each value that occurs fewer than k times in that column.
ex %>%
  mutate(
    assignment = map(
      assignment,
      modify,
      function(x, k) {
        # Occurrence count of each element, looked up by its character label
        occurrences <- table(x)[as.character(x)]
        x[occurrences < k] <- 0
        x
      },
      k = 5
    )
  )
#> # A tibble: 3 x 4
#> group data h_candidates assignment
#> <chr> <list> <list> <list>
#> 1 Group A <tibble [25 x 4]> <dbl [6]> <tibble [25 x 5]>
#> 2 Group B <tibble [25 x 4]> <dbl [6]> <tibble [25 x 5]>
#> 3 Group C <tibble [25 x 4]> <dbl [6]> <tibble [25 x 5]>
我們還可以定義一些輔助函數以使其更具可讀性:
# Overwrite with `val` those elements of `x` that the predicate `f` selects.
#
# x   : vector to modify
# f   : function called as f(x, ...); its result is used to index `x`
# val : replacement value
# ... : extra arguments forwarded to `f`
#
# Returns `x` with the selected elements replaced by `val`.
replace_if <- function(x, f, val, ...) {
  selected <- f(x, ...)
  x[selected] <- val
  x
}
# For each element of `x`, report whether its value occurs fewer than `k`
# times in `x`. Counts come from table(x) and are looked up per element via
# the element's character representation.
appears_less_than <- function(x, k) {
  counts <- table(x)
  labels <- as.character(x)
  counts[labels] < k
}
結合這兩個功能,我們可以得到:
# Values seen fewer than 2 times (here 2 and 3) are replaced by 0.
replace_if(x = c(1, 1, 2, 3), f = appears_less_than, val = 0, k = 2)
#> [1] 1 1 0 0
現在剩下的就是將各個部分放在一起:
# Apply the helpers to every column of every tibble in `assignment`,
# zeroing values that occur fewer than 3 times within their column.
res <- ex %>%
  mutate(
    assignment = map(
      assignment,
      function(tbl) modify(tbl, replace_if, appears_less_than, k = 3, 0)
    )
  )
正如 @thothal 所提到的,您的數據中沒有任何值出現超過 4 次,因此 k = 5 時所有值都會被替換為 0;但是改用 k = 3 時,我們可以看一下結果(僅展示 assignment 的第三個數據幀):
# Show the third tibble of the `assignment` list-column.
pluck(res, "assignment", 3)
#> # A tibble: 25 x 5
#> `0%` `0.01%` `0.02%` `0.03%` `0.04%`
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 0 0 0 0
#> 2 0 0 0 0 0
#> 3 0 0 0 3 3
#> 4 0 0 0 0 0
#> 5 0 0 5 0 5
#> 6 0 0 0 0 5
#> 7 0 0 0 0 0
#> 8 0 0 0 3 3
#> 9 0 0 0 0 0
#> 10 0 0 5 0 0
#> # ... with 15 more rows
最后,我們還可以使用范圍限定的mutate_at()
來進一步減少一些多余的語法:
# Same transformation written with the scoped verb: `map` is applied to the
# `assignment` column, and the remaining arguments are forwarded through
# map -> modify -> replace_if.
ex %>%
  mutate_at(
    .vars = vars(assignment),
    .funs = map,
    modify, replace_if, appears_less_than, k = 3, 0
  )
由reprex軟件包 (v0.2.0.9000)創建於2018-08-08。
這應該可以解決問題:
library(tidyverse)
# For each tibble in the `assignment` list-column, set to 0 every value that
# occurs fewer than 5 times within its column, by reshaping to long format,
# counting, and reshaping back.
ex %>%
  mutate(
    assignment = map(
      assignment,
      ~ rowid_to_column(.x, "id") %>%   # row id so spread() can rebuild rows
        gather(key, value, -id) %>%     # long format: key = former column name
        group_by(key) %>%
        add_count(value) %>%            # n = occurrences of value within its column
        # Keep the original value when it is frequent enough. The previous
        # version returned the count `n` here, overwriting the data.
        mutate(value = ifelse(n < 5, 0, value)) %>%
        ungroup() %>%
        select(-n) %>%
        spread(key, value) %>%          # back to the original wide layout
        select(-id)
    )
  )
請注意,在您的示例中,沒有一個值出現超過4次。
說明
- 用 map 遍歷所有 assignment data.frame
- 為每個 data.frame 添加一個 id 列(gather/spread 需要它)
- 用 gather 將除 id 以外的所有列收集為 key(原來的列名)/ value(值)對
- 按原來的列(現在的 key)分組(group_by),然後為每個 value 添加計數器
- 如果 n(計數器)小於 5,則將 value 替換為 0
- 用 spread 轉回原始格式
- 刪除 id 列
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.