将自定义函数应用于数据框中的列对

Question

我想将自定义函数应用于数据帧中的所有列对，以获得结果的 apxp 矩阵/数据帧。 在 tidyverse 中有没有一种快速的方法来做到这一点？

输出应该是results数据框。

custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

result <- tibble(cols = c("x","y","z"), 
                 x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
                 y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
                 z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))

result

Answer 1

您可以使用以下解决方案：

library(dplyr)
library(tibble)

expand.grid(names(data), names(data)) %>%
  rowwise() %>%
  mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
  pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
  column_to_rownames("Var2")

           x        y         z
x -0.3591433 2.157343 -1.470995
y  2.1573430 4.673829  1.045491
z -1.4709953 1.045491 -2.582847

Answer 2

如果custom_function是矢量化的，我们可以直接使用outer 。 但它是使用sum这是一个标量函数，所以我们可以通过它包裹使用它Vectorize()在FUN =在参数outer 。 像这样做-

outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))

tidyverse 策略虽然有点冗长，但如果你愿意，你可以在tidyverse管理这种方法。

library(tidyverse)
custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

expand.grid(names(data), names(data)) %>% 
  mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
  pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
  column_to_rownames('Var1') %>%
  as.matrix()
#>            x        y         z
#> x -0.3591433 2.157343 -1.470995
#> y  2.1573430 4.673829  1.045491
#> z -1.4709953 1.045491 -2.582847

^{由reprex 包( v2.0.0 ) 于 2021 年 7 月 10 日创建}

Answer 3

一个想法：

library(dplyr, warn.conflicts = FALSE)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

data_long <-
  data %>%
  mutate(id = 1:nrow(.)) %>%
  tidyr::pivot_longer(cols = -id)

result <-  
  data_long %>%
  inner_join(data_long, by = "id") %>%
  group_by(name.x, name.y) %>%
  summarize(value = custom_function(value.x, value.y),
            .groups = "drop") %>%
  tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
  rename(cols = name.y)

result
#> # A tibble: 3 x 4
#>   cols       x     y     z
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 x     -0.359  2.16 -1.47
#> 2 y      2.16   4.67  1.05
#> 3 z     -1.47   1.05 -2.58

^{由reprex 包( v2.0.0 ) 于 2021 年 7 月 10 日创建}

在这里它被组织成一个函数：

library(dplyr, warn.conflicts = FALSE)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

custom_summ <- function(df, f) {
  
  data_long <-
    data %>%
    mutate(id = 1:nrow(.)) %>%
    tidyr::pivot_longer(cols = -id)

  result <-  
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  
  result
}

custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#>   cols       x     y     z
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 x     -0.359  2.16 -1.47
#> 2 y      2.16   4.67  1.05
#> 3 z     -1.47   1.05 -2.58

^{由reprex 包( v2.0.0 ) 于 2021 年 7 月 10 日创建}

以下是各种选项的一些基准数据。 如果性能完全是一个问题，那么接受的答案中提供的 tidyverse 方法就不是一个好方法。 此处最快的选项是问题评论中提供的基于sapply的选项。

library(tidyverse)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)

get_data <- function() {
  data <- lapply(letters, function(i) rnorm(1000))
  names(data) <- letters
  as_tibble(data)
}

custom_summ <- function(df, f) {
  
  data_long <-
    data %>%
    mutate(id = 1:nrow(.)) %>%
    pivot_longer(cols = -id)
  
  result <-  
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  
  result
}

data <- get_data()

system.time(custom_summ(data, custom_function))
#>    user  system elapsed 
#>   0.053   0.007   0.062

custom_summ_2 <- function(data, f) {
  expand.grid(names(data), names(data)) %>% 
  mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
  pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
  column_to_rownames('Var1') %>%
  as.matrix()
}

system.time(custom_summ_2(data, custom_function))
#>    user  system elapsed 
#>  26.479   0.317  27.365

custom_summ_3 <- function(data, f) {
  expand.grid(names(data), names(data)) %>%
    rowwise() %>%
    mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
    pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
    column_to_rownames("Var2")
}

system.time(custom_summ_3(data, custom_function))
#>    user  system elapsed 
#>   0.048   0.001   0.049

custom_summ_4 <- function(data, f) {
  sapply(data, function(y) sapply(data, f, y = y))
}

system.time(custom_summ_4(data, custom_function))
#>    user  system elapsed 
#>   0.003   0.000   0.003

custom_summ_5 <- function(data, f) {
  outer(names(data), names(data), 
        FUN = Vectorize(function(x, y) f (data[x], data[y])))
}

system.time(custom_summ_5(data, custom_function))
#>    user  system elapsed 
#>   0.044   0.001   0.045

^{由reprex 包( v2.0.0 ) 于 2021 年 7 月 11 日创建}

将自定义函数应用于数据框中的列对

问题描述

3 个解决方案

解决方案1
3 2021-07-10 15:53:33

解决方案2
2 已采纳 2021-07-10 16:17:25

解决方案3
1 2021-07-10 16:15:23

将自定义函数应用于数据框中的列对

问题描述

3 个解决方案

解决方案1 3 2021-07-10 15:53:33

解决方案2 2 已采纳 2021-07-10 16:17:25

解决方案3 1 2021-07-10 16:15:23

解决方案1
3 2021-07-10 15:53:33

解决方案2
2 已采纳 2021-07-10 16:17:25

解决方案3
1 2021-07-10 16:15:23