簡體   English   中英

如何創建虛擬變量來指示組內其他觀察值的存在因素?

[英]How to create dummy variables that indicate the presence of a factor for other observations within in a group?

我正在使用如下數據框,其中Color和 `Player 是因子變量:

在此處輸入圖像描述

我想為顏色列的每個值創建指標變量。 但是,我希望這些指示變量表示顏色是否存在於同一游戲中的其他玩家(而不是該玩家是否存在)。 所以我希望上表變成:

在此處輸入圖像描述

我想代碼會有group_by(Game) %>% ,但除此之外我迷路了。

數據:

structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L, 
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green", 
"Purple", "Yellow"), Blue = c(1L, 1L, 0L, 0L, 0L, 0L), Green = c(1L, 
0L, 1L, 0L, 1L, 1L), Yellow = c(0L, 0L, 0L, 1L, 1L, 0L), Red = c(0L, 
1L, 1L, 0L, 0L, 0L), Purple = c(0L, 0L, 0L, 1L, 0L, 1L)), class = "data.frame", row.names = c(NA, 
-6L))

這是我們可以做到的一種方式:

首先,我們使用model.matrix()將其乘以1並在abs()的包裝中減去1 然后我們幾乎得到了所需的輸出,唯一剩下的就是在不存在顏色的情況下獲取零。 我們mutate across...來做到這一點:


library(dplyr)
df %>% 
  cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>% 
  group_by(Game) %>% 
  mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0, 
                                              TRUE ~ .)))
  Game  Player Color  ColorBlue ColorGreen ColorPurple ColorRed ColorYellow
  <chr>  <int> <chr>      <dbl>      <dbl>       <dbl>    <dbl>       <dbl>
1 A          1 Red            1          1           0        0           0
2 A          2 Green          1          0           0        1           0
3 A          3 Blue           0          1           0        1           0
4 B          1 Green          0          0           1        0           1
5 B          2 Purple         0          1           0        0           1
6 B          3 Yellow         0          1           1        0           0
> 

也許這有幫助 - 通過“游戲”拆分“顏色”列,通過比較“顏色”的元素( != )創建二進制矩陣,轉換為tibble ,行綁定( _dfr )並將數據集與原始數據集綁定( bind_cols

library(purrr)
library(dplyr)
library(tidyr)
map_dfr(split(df1$Color, df1$Game), ~ {
   m1 <- +(outer(.x, .x, FUN = `!=`))
  colnames(m1) <- .x
  as_tibble(m1)}) %>%
  mutate(across(everything(), replace_na, 0)) %>%
  bind_cols(df1, .)

-輸出

  Game Player  Color Red Green Blue Purple Yellow
1    A      1    Red   0     1    1      0      0
2    A      2  Green   1     0    1      0      0
3    A      3   Blue   1     1    0      0      0
4    B      1  Green   0     0    0      1      1
5    B      2 Purple   0     1    0      0      1
6    B      3 Yellow   0     1    0      1      0

或者另一種選擇是使用dummy_cols然后修改輸出

library(fastDummies)
library(stringr)
dummy_cols(df1, 'Color') %>%
  rename_with(~ str_remove(.x, "Color_")) %>%
  group_by(Game) %>% 
  mutate(across(Blue:Yellow, ~ +(Color != cur_column() & any(.x)))) %>%
   ungroup

-輸出

# A tibble: 6 × 8
  Game  Player Color   Blue Green Purple   Red Yellow
  <chr>  <int> <chr>  <int> <int>  <int> <int>  <int>
1 A          1 Red        1     1      0     0      0
2 A          2 Green      1     0      0     1      0
3 A          3 Blue       0     1      0     1      0
4 B          1 Green      0     0      1     0      1
5 B          2 Purple     0     1      0     0      1
6 B          3 Yellow     0     1      1     0      0

數據

df1 <- structure(list(Game = c("A", "A", "A", "B", "B", "B"), Player = c(1L, 
2L, 3L, 1L, 2L, 3L), Color = c("Red", "Green", "Blue", "Green", 
"Purple", "Yellow")), row.names = c(NA, -6L), class = "data.frame")

這是使用tidyverse中的full_joinpivot_wider的另一種方法。 我相信這也給出了相同的結果。 包含filter以避免與 1 相同的顏色指示符。

library(tidyverse)

full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
  filter(Color != Color_Two) %>%
  mutate(val = 1) %>%
  pivot_wider(id_cols = c(Game, Player, Color), 
              names_from = Color_Two, 
              values_from = val,
              values_fill = 0)

輸出

  Game  Player Color  Green  Blue   Red Purple Yellow
  <chr>  <int> <chr>  <dbl> <dbl> <dbl>  <dbl>  <dbl>
1 A          1 Red        1     1     0      0      0
2 A          2 Green      0     1     1      0      0
3 A          3 Blue       1     0     1      0      0
4 B          1 Green      0     0     0      1      1
5 B          2 Purple     1     0     0      0      1
6 B          3 Yellow     1     0     0      1      0

使用 base R,您可以編寫一個小函數並使用tapply進行評估:

fun <- function(x) {
  nms <- levels(x)
  tab <- tcrossprod(table(x))
  dimnames(tab) <- list(nms, nms)
  tab[x, ]
}

data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
  Game Player  Color Blue Green Purple Red Yellow
1    A      1    Red    1     1      0   1      0
2    A      2  Green    1     1      0   1      0
3    A      3   Blue    1     1      0   1      0
4    B      1  Green    0     1      1   0      1
5    B      2 Purple    0     1      1   0      1
6    B      3 Yellow    0     1      1   0      1

請注意,在給出的所有選項中,這是迄今為止最快的,但僅使用基礎 R:

這是基准:

library(microbenchmark)
microbenchmark(Tarjae(df1), akrun(df1), ben(df1), onyambu(df1),
               paulS(df1), unit = 'relative')
Unit: relative
         expr       min       lq      mean    median        uq      max neval
  Tarjae(df1) 18.775201 18.11495 13.533556 17.171485 15.746554 1.105045   100
   akrun(df1)  9.755032  8.83519  7.137294  8.756033  8.241494 1.455906   100
     ben(df1) 21.084371 18.57861 14.699821 17.950987 16.486863 3.124906   100
 onyambu(df1)  1.000000  1.00000  1.000000  1.000000  1.000000 1.000000   100
   paulS(df1) 33.108208 31.27110 24.918541 30.266024 27.420363 3.156215   100

對於較大的數據幀,一些給定的代碼會崩潰,而那些沒有崩潰的代碼對於基本 R 方法仍然很慢:

df2<- transform(data.frame(Game = sample(LETTERS, 2000, TRUE), Color = sample(colors(), 2000, TRUE)), Player = ave(Game, Game, FUN=seq_along))

 microbenchmark(Tarjae(df2), akrun(df2), onyambu(df2), paulS(df2))
Unit: milliseconds
         expr        min        lq      mean     median        uq       max neval
  Tarjae(df2) 2147.67826 2234.5575 2460.1924 2423.20994 2653.1737 3049.9455   100
   akrun(df2)  108.25249  121.3167  144.6715  130.48052  152.9518  404.7286   100
 onyambu(df2)   67.19992   80.3653  111.2821   91.05784  118.4877  331.6724   100
   paulS(df2)  183.88836  200.6224  231.0155  215.18942  237.5717  467.1721   100

基准代碼:

Tarjae <- function(df){
  df %>% 
    cbind(abs((model.matrix(~ Color + 0, .) == 1)*1-1)) %>% 
    group_by(Game) %>% 
    mutate(across(-c(Player, Color), ~case_when(sum(.)==3 ~0, 
                                                TRUE ~ .)))
}

akrun <- function(df1){
  map_dfr(split(df1$Color, df1$Game), ~ {
    m1 <- +(outer(.x, .x, FUN = `!=`))
    colnames(m1) <- .x
    as_tibble(m1)}) %>%
    mutate(across(everything(), replace_na, 0)) %>%
    bind_cols(df1, .)
}


ben <- function(df){
  full_join(df, df, by = "Game", suffix = c("", "_Two")) %>%
    filter(Color != Color_Two) %>%
    mutate(val = 1) %>%
    pivot_wider(id_cols = c(Game, Player, Color), 
                names_from = Color_Two, 
                values_from = val,
                values_fill = 0)
}

onyambu <- function(df1){
  fun <- function(x) {
  nms <- levels(x)
  tab <- tcrossprod(table(x))
  dimnames(tab) <- list(nms, nms)
  tab[x, ]
}

data.frame(df1, do.call(rbind, with(df1, tapply(factor(Color), Game, fun))), row.names = NULL)
}


paulS <- function(df){
  df %>% 
    group_by(Game) %>% 
    mutate(aux = list(Color)) %>% 
    unnest(aux) %>% 
    filter(aux != Color) %>% 
    ungroup %>% 
    pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0, 
                values_fn = length)
  
}

另一種可能的解決方案:

library(tidyverse)

df %>% 
  group_by(Game) %>% 
  mutate(aux = list(Color)) %>% 
  unnest(aux) %>% 
  filter(aux != Color) %>% 
  ungroup %>% 
  pivot_wider(Game:Color, names_from = aux, values_from = aux, values_fill = 0, 
    values_fn = length)

#> # A tibble: 6 × 8
#>   Game  Player Color  Green  Blue   Red Purple Yellow
#>   <chr>  <int> <chr>  <int> <int> <int>  <int>  <int>
#> 1 A          1 Red        1     1     0      0      0
#> 2 A          2 Green      0     1     1      0      0
#> 3 A          3 Blue       1     0     1      0      0
#> 4 B          1 Green      0     0     0      1      1
#> 5 B          2 Purple     1     0     0      0      1
#> 6 B          3 Yellow     1     0     0      1      0

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM