简体   繁体   中英

How to create dummy variables that indicate the presence of a factor for other observations within in a group?

I am working with a data frame like the following, where Color and `Player are factor variables:

在此处输入图像描述

I want to create indicator variables for each value of the color column. However, I want those indicator variables to represent whether the color is present for other players in the same game (not whether it's present for that player). So I want the above table to turn into:

在此处输入图像描述

I imagine the code will have group_by(Game) %>% , but I'm lost beyond that.

Data:

structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L, 
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green", 
"Purple", "Yellow"), Blue = c(1L, 1L, 0L, 0L, 0L, 0L), Green = c(1L, 
0L, 1L, 0L, 1L, 1L), Yellow = c(0L, 0L, 0L, 1L, 1L, 0L), Red = c(0L, 
1L, 1L, 0L, 0L, 0L), Purple = c(0L, 0L, 0L, 1L, 0L, 1L)), class = "data.frame", row.names = c(NA, 
-6L))

Here is a way how we could do it:

First we use model.matrix() fucntion multiply it by 1 and substract 1 within a wrap of abs() . Then we get almost the desired output, the only thing that is left is the get zeros in case if non of the colors is present. We do this with a mutate across... :


library(dplyr)
df %>% 
  cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>% 
  group_by(Game) %>% 
  mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0, 
                                              TRUE ~ .)))
  Game  Player Color  ColorBlue ColorGreen ColorPurple ColorRed ColorYellow
  <chr>  <int> <chr>      <dbl>      <dbl>       <dbl>    <dbl>       <dbl>
1 A          1 Red            1          1           0        0           0
2 A          2 Green          1          0           0        1           0
3 A          3 Blue           0          1           0        1           0
4 B          1 Green          0          0           1        0           1
5 B          2 Purple         0          1           0        0           1
6 B          3 Yellow         0          1           1        0           0
> 

Perhaps this helps - split the 'Color' column by 'Game', create a binary matrix by comparing the elements of 'Color' ( != ), convert to tibble , row bind ( _dfr ) and bind the dataset with the original dataset ( bind_cols )

library(purrr)
library(dplyr)
library(tidyr)
map_dfr(split(df1$Color, df1$Game), ~ {
   m1 <- +(outer(.x, .x, FUN = `!=`))
  colnames(m1) <- .x
  as_tibble(m1)}) %>%
  mutate(across(everything(), replace_na, 0)) %>%
  bind_cols(df1, .)

-output

  Game Player  Color Red Green Blue Purple Yellow
1    A      1    Red   0     1    1      0      0
2    A      2  Green   1     0    1      0      0
3    A      3   Blue   1     1    0      0      0
4    B      1  Green   0     0    0      1      1
5    B      2 Purple   0     1    0      0      1
6    B      3 Yellow   0     1    0      1      0

Or another option is with dummy_cols and then modify the output

library(fastDummies)
library(stringr)
dummy_cols(df1, 'Color') %>%
  rename_with(~ str_remove(.x, "Color_")) %>%
  group_by(Game) %>% 
  mutate(across(Blue:Yellow, ~ +(Color != cur_column() & any(.x)))) %>%
   ungroup

-output

# A tibble: 6 × 8
  Game  Player Color   Blue Green Purple   Red Yellow
  <chr>  <int> <chr>  <int> <int>  <int> <int>  <int>
1 A          1 Red        1     1      0     0      0
2 A          2 Green      1     0      0     1      0
3 A          3 Blue       0     1      0     1      0
4 B          1 Green      0     0      1     0      1
5 B          2 Purple     0     1      0     0      1
6 B          3 Yellow     0     1      1     0      0

data

df1 <- structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L, 
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green", 
"Purple", "Yellow")), row.names = c(NA, -6L), class = "data.frame")

Here is another approach using full_join and pivot_wider from tidyverse . I believe this also gives the same result. The filter is included to avoid same color indicators as 1.

library(tidyverse)

full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
  filter(Color != Color_Two) %>%
  mutate(val = 1) %>%
  pivot_wider(id_cols = c(Game, Player, Color), 
              names_from = Color_Two, 
              values_from = val,
              values_fill = 0)

Output

  Game  Player Color  Green  Blue   Red Purple Yellow
  <chr>  <int> <chr>  <dbl> <dbl> <dbl>  <dbl>  <dbl>
1 A          1 Red        1     1     0      0      0
2 A          2 Green      0     1     1      0      0
3 A          3 Blue       1     0     1      0      0
4 B          1 Green      0     0     0      1      1
5 B          2 Purple     1     0     0      0      1
6 B          3 Yellow     1     0     0      1      0

Using base R, you can write a small function and evaluate using tapply :

fun <- function(x) {
  nms <- levels(x)
  tab <- tcrossprod(table(x))
  dimnames(tab) <- list(nms, nms)
  tab[x, ]
}

data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
  Game Player  Color Blue Green Purple Red Yellow
1    A      1    Red    1     1      0   1      0
2    A      2  Green    1     1      0   1      0
3    A      3   Blue    1     1      0   1      0
4    B      1  Green    0     1      1   0      1
5    B      2 Purple    0     1      1   0      1
6    B      3 Yellow    0     1      1   0      1

Note that out of all the options given, This is by far the fastest, yet only using base R:

Here is the benchmark:

library(microbenchmark)
microbenchmark(Tarjae(df1), akrun(df1), ben(df1), onyambu(df1),
               paulS(df1), unit = 'relative')
Unit: relative
         expr       min       lq      mean    median        uq      max neval
  Tarjae(df1) 18.775201 18.11495 13.533556 17.171485 15.746554 1.105045   100
   akrun(df1)  9.755032  8.83519  7.137294  8.756033  8.241494 1.455906   100
     ben(df1) 21.084371 18.57861 14.699821 17.950987 16.486863 3.124906   100
 onyambu(df1)  1.000000  1.00000  1.000000  1.000000  1.000000 1.000000   100
   paulS(df1) 33.108208 31.27110 24.918541 30.266024 27.420363 3.156215   100

For larger dataframes, some of the given code breaks down, while those that dont break down are still slow to the base R approach:

df2<- transform(data.frame(Game = sample(LETTERS, 2000, TRUE), Color = sample(colors(), 2000, TRUE)), Player = ave(Game, Game, FUN=seq_along))

 microbenchmark(Tarjae(df2), akrun(df2), onyambu(df2), paulS(df2))
Unit: milliseconds
         expr        min        lq      mean     median        uq       max neval
  Tarjae(df2) 2147.67826 2234.5575 2460.1924 2423.20994 2653.1737 3049.9455   100
   akrun(df2)  108.25249  121.3167  144.6715  130.48052  152.9518  404.7286   100
 onyambu(df2)   67.19992   80.3653  111.2821   91.05784  118.4877  331.6724   100
   paulS(df2)  183.88836  200.6224  231.0155  215.18942  237.5717  467.1721   100

Code for the benchmark:

Tarjae <- function(df){
  df %>% 
    cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>% 
    group_by(Game) %>% 
    mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0, 
                                                TRUE ~ .)))
}

akrun <- function(df1){
  map_dfr(split(df1$Color, df1$Game), ~ {
    m1 <- +(outer(.x, .x, FUN = `!=`))
    colnames(m1) <- .x
    as_tibble(m1)}) %>%
    mutate(across(everything(), replace_na, 0)) %>%
    bind_cols(df1, .)
}


ben <- function(df){
  full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
    filter(Color != Color_Two) %>%
    mutate(val = 1) %>%
    pivot_wider(id_cols = c(Game, Player, Color), 
                names_from = Color_Two, 
                values_from = val,
                values_fill = 0)
}

onyambu <- function(df1){
  fun <- function(x) {
  nms <- levels(x)
  tab <- tcrossprod(table(x))
  dimnames(tab) <- list(nms, nms)
  tab[x, ]
}

data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
}


paulS <- function(df){
  df %>% 
    group_by(Game) %>% 
    mutate(aux = list(Color)) %>% 
    unnest(aux) %>% 
    filter(aux != Color) %>% 
    ungroup %>% 
    pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0, 
                values_fn = length)
  
}

Another possible solution:

library(tidyverse)

df %>% 
  group_by(Game) %>% 
  mutate(aux = list(Color)) %>% 
  unnest(aux) %>% 
  filter(aux != Color) %>% 
  ungroup %>% 
  pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0, 
    values_fn = length)

#> # A tibble: 6 × 8
#>   Game  Player Color  Green  Blue   Red Purple Yellow
#>   <chr>  <int> <chr>  <int> <int> <int>  <int>  <int>
#> 1 A          1 Red        1     1     0      0      0
#> 2 A          2 Green      0     1     1      0      0
#> 3 A          3 Blue       1     0     1      0      0
#> 4 B          1 Green      0     0     0      1      1
#> 5 B          2 Purple     1     0     0      0      1
#> 6 B          3 Yellow     1     0     0      1      0

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM