簡體   English   中英

如何根據 R 中另一列中的字符創建二進制列?

[英]How do I create a binary column based off characters in another column in R?

ROW   ID       SEX               RACE               
2  REC1000023   F                1.Black
7  REC1000032   M                6.White
8  REC1000066   M                4.Asian
9  REC1000078   M                6.White
10 REC1000099   M                5.Multiracial 

我想創建一個二進制變量“Black”,並根據“RACE”列中的值將其設為 0 或 1。 我還想要一個“白色”列和一個“其他”列。 像這樣:

ROW   ID       SEX               RACE           Black   White  Other        
2  REC1000023   F                1.Black         1      0      0
7  REC1000032   M                6.White         0      1      0
8  REC1000066   M                4.Asian         0      0      1
9  REC1000078   M                6.White         0      1      0
10 REC1000099   M                5.Multiracial   0      0      1

如果黑色始終編碼為1.BlackWhite始終編碼為6.White您可以使用==進行比較,並使用+將 TRUE/FALSE 向量設為 1/0:

df$Black <- +(df$RACE == "1.Black")
df$White <- +(df$RACE == "6.White")

如果其他字符發生變化,那么您可以使用grepl

df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
df$White <- +grepl("White", df$RACE, fixed = TRUE)

要獲得剩余的列Other只需使用Black and White中已有的內容:

df$Other <- 1 - (df$Black | df$White)

結果:

df
#  ROW         ID SEX          RACE Black White Other
#1   2 REC1000023   F       1.Black     1     0     0
#2   7 REC1000032   M       6.White     0     1     0
#3   8 REC1000066   M       4.Asian     0     0     1
#4   9 REC1000078   M       6.White     0     1     0
#5  10 REC1000099   M 5.Multiracial     0     0     1

這是否有效:

library(dplyr)
library(stringr)
df %>% mutate(Black = +str_detect(RACE,'Black'),
              White = +str_detect(RACE,'White'),
              Other = +(!str_detect(RACE,'Black|White')))
# A tibble: 5 x 7
    ROW ID         SEX   RACE          Black White Other
  <dbl> <chr>      <chr> <chr>         <int> <int> <int>
1     2 REC1000023 F     1.Black           1     0     0
2     7 REC1000032 M     6.White           0     1     0
3     8 REC1000066 M     4.Asian           0     0     1
4     9 REC1000078 M     6.White           0     1     0
5    10 REC1000099 M     5.Multiracial     0     0     1

創建一個新列,其中除c('Black', 'White')之外的任何值都更改為'Other'並使用pivot_wider

library(dplyr)
library(tidyr)

df %>%
  mutate(col = sub('\\d+\\.', '', RACE), 
         col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
  pivot_wider(names_from = col, values_from = col, 
              values_fn = length, values_fill = 0)

#    ROW ID         SEX   RACE          Black White Other
#  <int> <chr>      <chr> <chr>         <int> <int> <int>
#1     2 REC1000023 F     1.Black           1     0     0
#2     7 REC1000032 M     6.White           0     1     0
#3     8 REC1000066 M     4.Asian           0     0     1
#4     9 REC1000078 M     6.White           0     1     0
#5    10 REC1000099 M     5.Multiracial     0     0     1

使用 ifelse:

library(tidyverse)
# Example data
df <- data.frame(
  stringsAsFactors = FALSE,
               ROW = c(2L, 7L, 8L, 9L, 10L),
                ID = c("REC1000023","REC1000032",
                       "REC1000066","REC1000078","REC1000099"),
               SEX = c("F", "M", "M", "M", "M"),
              RACE = c("1.Black","6.White","4.Asian",
                       "6.White","5.Multiracial")
)

# Create new columns
df2 <- df %>% 
  mutate(Black = ifelse(RACE == "1.Black", 1, 0),
         White = ifelse(RACE == "6.White", 1, 0),
         Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
df2
#  ROW         ID SEX          RACE Black White Other
#1   2 REC1000023   F       1.Black     1     0     0
#2   7 REC1000032   M       6.White     0     1     0
#3   8 REC1000066   M       4.Asian     0     0     1
#4   9 REC1000078   M       6.White     0     1     0
#5  10 REC1000099   M 5.Multiracial     0     0     1

--

不確定速度是否是您的應用程序中的一個因素,但這是使用示例數據集的基准:

ronak_func <- function(df){
  df %>%
    mutate(col = sub('\\d+\\.', '', RACE), 
           col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
    pivot_wider(names_from = col, values_from = col, 
                values_fn = length, values_fill = 0)
}

jared_func <- function(df){
  df %>% 
    mutate(Black = ifelse(RACE == "1.Black", 1, 0),
           White = ifelse(RACE == "6.White", 1, 0),
           Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
}

karthik_func <- function(df){
  df %>% mutate(Black = +str_detect(RACE,'Black'),
                White = +str_detect(RACE,'White'),
                Other = +(!str_detect(RACE,'Black|White')))
}

jpdugo17_func <- function(df){
  map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>% 
    mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
}

GKi1_func <- function(df) {
  df$Black <- +(df$RACE == "1.Black")
  df$White <- +(df$RACE == "6.White")
  df$Other <- 1 - (df$Black | df$White)
  df
}

GKi2_func <- function(df) {
  df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
  df$White <- +grepl("White", df$RACE, fixed = TRUE)
  df$Other <- 1 - (df$Black | df$White)
  df
}

jared_func_dt <- function(df){
  setDT(df)
  df[, Black := +(df$RACE == "1.Black")][, White := +(df$RACE == "6.White")][, Other :=  1 - (df$Black | df$White)]
}

res <- microbenchmark::microbenchmark(ronak_func(df),
                                      jared_func(df),
                                      karthik_func(df),
                                      jpdugo17_func(df),
                                      GKi1_func(df),
                                      GKi2_func(df),
                                      jared_func_dt(df))
autoplot(res)

example_4.png

以及使用具有 10k 行的示例數據集的基准測試:

df2 <- data.frame(stringsAsFactors = FALSE,
                  ROW = 1:10000,
                  ID = rep(c("REC1000023","REC1000032",
                             "REC1000066","REC1000078",
                             "REC1000099"), times = 2000),
                  SEX = sample(c("F", "M"),
                               replace = TRUE,
                               size = 10000),
                  RACE = sample(c("1.Black","6.White","4.Asian",
                           "6.White","5.Multiracial"),
                           replace = TRUE,
                           size = 10000))
res <- microbenchmark::microbenchmark(ronak_func(df2),
                                      jared_func(df2),
                                      karthik_func(df2),
                                      jpdugo17_func(df2),
                                      GKi1_func(df2),
                                      GKi2_func(df2),
                                      jared_func_dt(df2))
autoplot(res)

示例_3.png

library(tidyverse)
df <- 
read_table(file = "ROW   ID       SEX               RACE               
2  REC1000023   F                1.Black
7  REC1000032   M                6.White
8  REC1000066   M                4.Asian
9  REC1000078   M                6.White
10 REC1000099   M                5.Multiracial ")

map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>% 
    mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
#>        ROW   ID SEX          RACE Black White other
#> 1 2  REC1000023   F       1.Black     1     0     0
#> 2 7  REC1000032   M       6.White     0     1     0
#> 3 8  REC1000066   M       4.Asian     0     0     1
#> 4 9  REC1000078   M       6.White     0     1     0
#> 5 10 REC1000099   M 5.Multiracial     0     0     1

代表 package (v2.0.0) 於 2021 年 6 月 16 日創建

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM