[英]Apply a custom function to pairs of columns in a dataframe
我想将自定义函数应用于数据帧中的所有列对,以获得结果的 apxp 矩阵/数据帧。 在 tidyverse 中有没有一种快速的方法来做到这一点?
输出应该是results
数据框。
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
result <- tibble(cols = c("x","y","z"),
x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))
result
您可以使用以下解决方案:
library(dplyr)
library(tibble)
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = unique("Var1"), values_from = "Res") %>%
column_to_rownames("Var2")
x y z
x -0.3591433 2.157343 -1.470995
y 2.1573430 4.673829 1.045491
z -1.4709953 1.045491 -2.582847
如果custom_function
是矢量化的,我们可以直接使用outer
。 但它是使用sum
这是一个标量函数,所以我们可以通过它包裹使用它Vectorize()
在FUN =
在参数outer
。 像这样做-
outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))
tidyverse 策略虽然有点冗长,但如果你愿意,你可以在tidyverse
管理这种方法。
library(tidyverse)
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
#> x y z
#> x -0.3591433 2.157343 -1.470995
#> y 2.1573430 4.673829 1.045491
#> z -1.4709953 1.045491 -2.582847
一个想法:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = custom_function(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
由reprex 包( v2.0.0 ) 于 2021 年 7 月 10 日创建
在这里它被组织成一个函数:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
custom_summ <- function(df, f) {
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
由reprex 包( v2.0.0 ) 于 2021 年 7 月 10 日创建
以下是各种选项的一些基准数据。 如果性能完全是一个问题,那么接受的答案中提供的 tidyverse 方法就不是一个好方法。 此处最快的选项是问题评论中提供的基于sapply
的选项。
library(tidyverse)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
get_data <- function() {
data <- lapply(letters, function(i) rnorm(1000))
names(data) <- letters
as_tibble(data)
}
custom_summ <- function(df, f) {
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
data <- get_data()
system.time(custom_summ(data, custom_function))
#> user system elapsed
#> 0.053 0.007 0.062
custom_summ_2 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
}
system.time(custom_summ_2(data, custom_function))
#> user system elapsed
#> 26.479 0.317 27.365
custom_summ_3 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = unique("Var1"), values_from = "Res") %>%
column_to_rownames("Var2")
}
system.time(custom_summ_3(data, custom_function))
#> user system elapsed
#> 0.048 0.001 0.049
custom_summ_4 <- function(data, f) {
sapply(data, function(y) sapply(data, f, y = y))
}
system.time(custom_summ_4(data, custom_function))
#> user system elapsed
#> 0.003 0.000 0.003
custom_summ_5 <- function(data, f) {
outer(names(data), names(data),
FUN = Vectorize(function(x, y) f (data[x], data[y])))
}
system.time(custom_summ_5(data, custom_function))
#> user system elapsed
#> 0.044 0.001 0.045
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.