[英]How do I create a binary column based off characters in another column in R?
ROW ID SEX RACE
2 REC1000023 F 1.Black
7 REC1000032 M 6.White
8 REC1000066 M 4.Asian
9 REC1000078 M 6.White
10 REC1000099 M 5.Multiracial
我想創建一個二進制變量“Black”,並根據“RACE”列中的值將其設為 0 或 1。 我還想要一個“白色”列和一個“其他”列。 像這樣:
ROW ID SEX RACE Black White Other
2 REC1000023 F 1.Black 1 0 0
7 REC1000032 M 6.White 0 1 0
8 REC1000066 M 4.Asian 0 0 1
9 REC1000078 M 6.White 0 1 0
10 REC1000099 M 5.Multiracial 0 0 1
如果黑色始終編碼為1.Black
和White始終編碼為6.White
您可以使用==
進行比較,並使用+
將 TRUE/FALSE 向量設為 1/0:
df$Black <- +(df$RACE == "1.Black")
df$White <- +(df$RACE == "6.White")
如果其他字符發生變化,那么您可以使用grepl
:
df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
df$White <- +grepl("White", df$RACE, fixed = TRUE)
要獲得剩余的列Other只需使用Black
and White
中已有的內容:
df$Other <- 1 - (df$Black | df$White)
結果:
df
# ROW ID SEX RACE Black White Other
#1 2 REC1000023 F 1.Black 1 0 0
#2 7 REC1000032 M 6.White 0 1 0
#3 8 REC1000066 M 4.Asian 0 0 1
#4 9 REC1000078 M 6.White 0 1 0
#5 10 REC1000099 M 5.Multiracial 0 0 1
這是否有效:
library(dplyr)
library(stringr)
df %>% mutate(Black = +str_detect(RACE,'Black'),
White = +str_detect(RACE,'White'),
Other = +(!str_detect(RACE,'Black|White')))
# A tibble: 5 x 7
ROW ID SEX RACE Black White Other
<dbl> <chr> <chr> <chr> <int> <int> <int>
1 2 REC1000023 F 1.Black 1 0 0
2 7 REC1000032 M 6.White 0 1 0
3 8 REC1000066 M 4.Asian 0 0 1
4 9 REC1000078 M 6.White 0 1 0
5 10 REC1000099 M 5.Multiracial 0 0 1
創建一個新列,其中除c('Black', 'White')
之外的任何值都更改為'Other'
並使用pivot_wider
。
library(dplyr)
library(tidyr)
df %>%
mutate(col = sub('\\d+\\.', '', RACE),
col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
pivot_wider(names_from = col, values_from = col,
values_fn = length, values_fill = 0)
# ROW ID SEX RACE Black White Other
# <int> <chr> <chr> <chr> <int> <int> <int>
#1 2 REC1000023 F 1.Black 1 0 0
#2 7 REC1000032 M 6.White 0 1 0
#3 8 REC1000066 M 4.Asian 0 0 1
#4 9 REC1000078 M 6.White 0 1 0
#5 10 REC1000099 M 5.Multiracial 0 0 1
使用 ifelse:
library(tidyverse)
# Example data
df <- data.frame(
stringsAsFactors = FALSE,
ROW = c(2L, 7L, 8L, 9L, 10L),
ID = c("REC1000023","REC1000032",
"REC1000066","REC1000078","REC1000099"),
SEX = c("F", "M", "M", "M", "M"),
RACE = c("1.Black","6.White","4.Asian",
"6.White","5.Multiracial")
)
# Create new columns
df2 <- df %>%
mutate(Black = ifelse(RACE == "1.Black", 1, 0),
White = ifelse(RACE == "6.White", 1, 0),
Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
df2
# ROW ID SEX RACE Black White Other
#1 2 REC1000023 F 1.Black 1 0 0
#2 7 REC1000032 M 6.White 0 1 0
#3 8 REC1000066 M 4.Asian 0 0 1
#4 9 REC1000078 M 6.White 0 1 0
#5 10 REC1000099 M 5.Multiracial 0 0 1
--
不確定速度是否是您的應用程序中的一個因素,但這是使用示例數據集的基准:
ronak_func <- function(df){
df %>%
mutate(col = sub('\\d+\\.', '', RACE),
col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
pivot_wider(names_from = col, values_from = col,
values_fn = length, values_fill = 0)
}
jared_func <- function(df){
df %>%
mutate(Black = ifelse(RACE == "1.Black", 1, 0),
White = ifelse(RACE == "6.White", 1, 0),
Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
}
karthik_func <- function(df){
df %>% mutate(Black = +str_detect(RACE,'Black'),
White = +str_detect(RACE,'White'),
Other = +(!str_detect(RACE,'Black|White')))
}
jpdugo17_func <- function(df){
map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>%
mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
}
GKi1_func <- function(df) {
df$Black <- +(df$RACE == "1.Black")
df$White <- +(df$RACE == "6.White")
df$Other <- 1 - (df$Black | df$White)
df
}
GKi2_func <- function(df) {
df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
df$White <- +grepl("White", df$RACE, fixed = TRUE)
df$Other <- 1 - (df$Black | df$White)
df
}
jared_func_dt <- function(df){
setDT(df)
df[, Black := +(df$RACE == "1.Black")][, White := +(df$RACE == "6.White")][, Other := 1 - (df$Black | df$White)]
}
res <- microbenchmark::microbenchmark(ronak_func(df),
jared_func(df),
karthik_func(df),
jpdugo17_func(df),
GKi1_func(df),
GKi2_func(df),
jared_func_dt(df))
autoplot(res)
以及使用具有 10k 行的示例數據集的基准測試:
df2 <- data.frame(stringsAsFactors = FALSE,
ROW = 1:10000,
ID = rep(c("REC1000023","REC1000032",
"REC1000066","REC1000078",
"REC1000099"), times = 2000),
SEX = sample(c("F", "M"),
replace = TRUE,
size = 10000),
RACE = sample(c("1.Black","6.White","4.Asian",
"6.White","5.Multiracial"),
replace = TRUE,
size = 10000))
res <- microbenchmark::microbenchmark(ronak_func(df2),
jared_func(df2),
karthik_func(df2),
jpdugo17_func(df2),
GKi1_func(df2),
GKi2_func(df2),
jared_func_dt(df2))
autoplot(res)
library(tidyverse)
df <-
read_table(file = "ROW ID SEX RACE
2 REC1000023 F 1.Black
7 REC1000032 M 6.White
8 REC1000066 M 4.Asian
9 REC1000078 M 6.White
10 REC1000099 M 5.Multiracial ")
map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>%
mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
#> ROW ID SEX RACE Black White other
#> 1 2 REC1000023 F 1.Black 1 0 0
#> 2 7 REC1000032 M 6.White 0 1 0
#> 3 8 REC1000066 M 4.Asian 0 0 1
#> 4 9 REC1000078 M 6.White 0 1 0
#> 5 10 REC1000099 M 5.Multiracial 0 0 1
由代表 package (v2.0.0) 於 2021 年 6 月 16 日創建
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.