如何创建虚拟变量来指示组内其他观察值的存在因素？

Question

我正在使用如下数据框，其中Color和 `Player 是因子变量：

我想为颜色列的每个值创建指标变量。 但是，我希望这些指示变量表示颜色是否存在于同一游戏中的其他玩家（而不是该玩家是否存在）。 所以我希望上表变成：

我想代码会有group_by(Game) %>% ，但除此之外我迷路了。

数据：

structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L, 
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green", 
"Purple", "Yellow"), Blue = c(1L, 1L, 0L, 0L, 0L, 0L), Green = c(1L, 
0L, 1L, 0L, 1L, 1L), Yellow = c(0L, 0L, 0L, 1L, 1L, 0L), Red = c(0L, 
1L, 1L, 0L, 0L, 0L), Purple = c(0L, 0L, 0L, 1L, 0L, 1L)), class = "data.frame", row.names = c(NA, 
-6L))

Answer 1

这是我们可以做到的一种方式：

首先，我们使用model.matrix()将其乘以1并在abs()的包装中减去1 。 然后我们几乎得到了所需的输出，唯一剩下的就是在不存在颜色的情况下获取零。 我们mutate across...来做到这一点：


library(dplyr)
df %>% 
  cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>% 
  group_by(Game) %>% 
  mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0, 
                                              TRUE ~ .)))

  Game  Player Color  ColorBlue ColorGreen ColorPurple ColorRed ColorYellow
  <chr>  <int> <chr>      <dbl>      <dbl>       <dbl>    <dbl>       <dbl>
1 A          1 Red            1          1           0        0           0
2 A          2 Green          1          0           0        1           0
3 A          3 Blue           0          1           0        1           0
4 B          1 Green          0          0           1        0           1
5 B          2 Purple         0          1           0        0           1
6 B          3 Yellow         0          1           1        0           0
>

Answer 2

也许这有帮助 - 通过“游戏”拆分“颜色”列，通过比较“颜色”的元素（ != ）创建二进制矩阵，转换为tibble ，行绑定（ _dfr ）并将数据集与原始数据集绑定（ bind_cols ）

library(purrr)
library(dplyr)
library(tidyr)
map_dfr(split(df1$Color, df1$Game), ~ {
   m1 <- +(outer(.x, .x, FUN = `!=`))
  colnames(m1) <- .x
  as_tibble(m1)}) %>%
  mutate(across(everything(), replace_na, 0)) %>%
  bind_cols(df1, .)

-输出

  Game Player  Color Red Green Blue Purple Yellow
1    A      1    Red   0     1    1      0      0
2    A      2  Green   1     0    1      0      0
3    A      3   Blue   1     1    0      0      0
4    B      1  Green   0     0    0      1      1
5    B      2 Purple   0     1    0      0      1
6    B      3 Yellow   0     1    0      1      0

或者另一种选择是使用dummy_cols然后修改输出

library(fastDummies)
library(stringr)
dummy_cols(df1, 'Color') %>%
  rename_with(~ str_remove(.x, "Color_")) %>%
  group_by(Game) %>% 
  mutate(across(Blue:Yellow, ~ +(Color != cur_column() & any(.x)))) %>%
   ungroup

-输出

# A tibble: 6 × 8
  Game  Player Color   Blue Green Purple   Red Yellow
  <chr>  <int> <chr>  <int> <int>  <int> <int>  <int>
1 A          1 Red        1     1      0     0      0
2 A          2 Green      1     0      0     1      0
3 A          3 Blue       0     1      0     1      0
4 B          1 Green      0     0      1     0      1
5 B          2 Purple     0     1      0     0      1
6 B          3 Yellow     0     1      1     0      0

数据

df1 <- structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L, 
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green", 
"Purple", "Yellow")), row.names = c(NA, -6L), class = "data.frame")

Answer 3

这是使用tidyverse中的full_join和pivot_wider的另一种方法。 我相信这也给出了相同的结果。 包含filter以避免与 1 相同的颜色指示符。

library(tidyverse)

full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
  filter(Color != Color_Two) %>%
  mutate(val = 1) %>%
  pivot_wider(id_cols = c(Game, Player, Color), 
              names_from = Color_Two, 
              values_from = val,
              values_fill = 0)

输出

  Game  Player Color  Green  Blue   Red Purple Yellow
  <chr>  <int> <chr>  <dbl> <dbl> <dbl>  <dbl>  <dbl>
1 A          1 Red        1     1     0      0      0
2 A          2 Green      0     1     1      0      0
3 A          3 Blue       1     0     1      0      0
4 B          1 Green      0     0     0      1      1
5 B          2 Purple     1     0     0      0      1
6 B          3 Yellow     1     0     0      1      0

Answer 4

使用 base R，您可以编写一个小函数并使用tapply进行评估：

fun <- function(x) {
  nms <- levels(x)
  tab <- tcrossprod(table(x))
  dimnames(tab) <- list(nms, nms)
  tab[x, ]
}

data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
  Game Player  Color Blue Green Purple Red Yellow
1    A      1    Red    1     1      0   1      0
2    A      2  Green    1     1      0   1      0
3    A      3   Blue    1     1      0   1      0
4    B      1  Green    0     1      1   0      1
5    B      2 Purple    0     1      1   0      1
6    B      3 Yellow    0     1      1   0      1

请注意，在给出的所有选项中，这是迄今为止最快的，但仅使用基础 R：

这是基准：

library(microbenchmark)
microbenchmark(Tarjae(df1), akrun(df1), ben(df1), onyambu(df1),
               paulS(df1), unit = 'relative')
Unit: relative
         expr       min       lq      mean    median        uq      max neval
  Tarjae(df1) 18.775201 18.11495 13.533556 17.171485 15.746554 1.105045   100
   akrun(df1)  9.755032  8.83519  7.137294  8.756033  8.241494 1.455906   100
     ben(df1) 21.084371 18.57861 14.699821 17.950987 16.486863 3.124906   100
 onyambu(df1)  1.000000  1.00000  1.000000  1.000000  1.000000 1.000000   100
   paulS(df1) 33.108208 31.27110 24.918541 30.266024 27.420363 3.156215   100

对于较大的数据帧，一些给定的代码会崩溃，而那些没有崩溃的代码对于基本 R 方法仍然很慢：

df2<- transform(data.frame(Game = sample(LETTERS, 2000, TRUE), Color = sample(colors(), 2000, TRUE)), Player = ave(Game, Game, FUN=seq_along))

 microbenchmark(Tarjae(df2), akrun(df2), onyambu(df2), paulS(df2))
Unit: milliseconds
         expr        min        lq      mean     median        uq       max neval
  Tarjae(df2) 2147.67826 2234.5575 2460.1924 2423.20994 2653.1737 3049.9455   100
   akrun(df2)  108.25249  121.3167  144.6715  130.48052  152.9518  404.7286   100
 onyambu(df2)   67.19992   80.3653  111.2821   91.05784  118.4877  331.6724   100
   paulS(df2)  183.88836  200.6224  231.0155  215.18942  237.5717  467.1721   100

基准代码：

Tarjae <- function(df){
  df %>% 
    cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>% 
    group_by(Game) %>% 
    mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0, 
                                                TRUE ~ .)))
}

akrun <- function(df1){
  map_dfr(split(df1$Color, df1$Game), ~ {
    m1 <- +(outer(.x, .x, FUN = `!=`))
    colnames(m1) <- .x
    as_tibble(m1)}) %>%
    mutate(across(everything(), replace_na, 0)) %>%
    bind_cols(df1, .)
}


ben <- function(df){
  full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
    filter(Color != Color_Two) %>%
    mutate(val = 1) %>%
    pivot_wider(id_cols = c(Game, Player, Color), 
                names_from = Color_Two, 
                values_from = val,
                values_fill = 0)
}

onyambu <- function(df1){
  fun <- function(x) {
  nms <- levels(x)
  tab <- tcrossprod(table(x))
  dimnames(tab) <- list(nms, nms)
  tab[x, ]
}

data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
}


paulS <- function(df){
  df %>% 
    group_by(Game) %>% 
    mutate(aux = list(Color)) %>% 
    unnest(aux) %>% 
    filter(aux != Color) %>% 
    ungroup %>% 
    pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0, 
                values_fn = length)
  
}

Answer 5

另一种可能的解决方案：

library(tidyverse)

df %>% 
  group_by(Game) %>% 
  mutate(aux = list(Color)) %>% 
  unnest(aux) %>% 
  filter(aux != Color) %>% 
  ungroup %>% 
  pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0, 
    values_fn = length)

#> # A tibble: 6 × 8
#>   Game  Player Color  Green  Blue   Red Purple Yellow
#>   <chr>  <int> <chr>  <int> <int> <int>  <int>  <int>
#> 1 A          1 Red        1     1     0      0      0
#> 2 A          2 Green      0     1     1      0      0
#> 3 A          3 Blue       1     0     1      0      0
#> 4 B          1 Green      0     0     0      1      1
#> 5 B          2 Purple     1     0     0      0      1
#> 6 B          3 Yellow     1     0     0      1      0

如何创建虚拟变量来指示组内其他观察值的存在因素？

问题描述

5 个解决方案

解决方案1
2 2022-07-17 20:00:07

解决方案2
2 2022-07-17 20:05:51

数据

解决方案3
1 2022-07-17 21:11:17

解决方案4
0 2022-07-17 21:31:50

解决方案5
0 2022-07-17 21:35:28

如何创建虚拟变量来指示组内其他观察值的存在因素？

问题描述

5 个解决方案

解决方案1 2 2022-07-17 20:00:07

解决方案2 2 2022-07-17 20:05:51

数据

解决方案3 1 2022-07-17 21:11:17

解决方案4 0 2022-07-17 21:31:50

解决方案5 0 2022-07-17 21:35:28

解决方案1
2 2022-07-17 20:00:07

解决方案2
2 2022-07-17 20:05:51

解决方案3
1 2022-07-17 21:11:17

解决方案4
0 2022-07-17 21:31:50

解决方案5
0 2022-07-17 21:35:28