将具有伪变量的数据框转换为分类变量

Question

I need to convert dummy into categorical variables. 我需要将虚拟变量转换为分类变量。 Being new to R i just know how to do it the other way round. 对R陌生，我只是反过来知道如何去做。 Can someone point me in the right direction? 有人可以指出我正确的方向吗？

The dataframe is: 数据框为：

data <- data.frame(id=c(1,2,3,4,5,6,7,8,9), 
               red=c("1","0","1","0","1","0","0","0","0"),
               blue=c("1","1","1","1","0","1","1","1","0"),
               yellow=c("0","0","0","0","0","0","0","1","1"))

and expected output is: 预期输出为：

Answer 1

One option with lapply by ignoring the first column ( id ), we check which columns have value 1 in it and replace them with the corresponding column names and others can be changed to NA . 通过忽略第一列（ id ）使用lapply的一个选项，我们检查其中哪些列的值为1，并将它们替换为对应的列名，其他的可以更改为NA 。

data[-1] <- lapply(names(data[-1]), function(x) ifelse(data[x] == 1, x, NA))

data
#  id  red blue yellow
#1  1  red blue   <NA>
#2  2 <NA> blue   <NA>
#3  3  red blue   <NA>
#4  4 <NA> blue   <NA>
#5  5  red <NA>   <NA>
#6  6 <NA> blue   <NA>
#7  7 <NA> blue   <NA>
#8  8 <NA> blue yellow
#9  9 <NA> <NA> yellow

Another approach without using lapply loop 不使用lapply循环的另一种方法

data[-1] <- ifelse(data[-1] == 1, names(data[-1])[col(data[-1])], NA)


data
#  id  red blue yellow
#1  1  red blue   <NA>
#2  2 <NA> blue   <NA>
#3  3  red blue   <NA>
#4  4 <NA> blue   <NA>
#5  5  red <NA>   <NA>
#6  6 <NA> blue   <NA>
#7  7 <NA> blue   <NA>
#8  8 <NA> blue yellow
#9  9 <NA> <NA> yellow

Answer 2

As the 'red', 'blue', 'yellow' columns are factor , we coerce it to numeric and use the index to replace with the corresponding column names within Map 由于'red'，'blue'，'yellow'列是factor ，因此我们将其强制为numeric然后使用索引替换为Map相应的列名

data[-1] <- Map(function(x, y) c('None', y)[as.numeric(x)], 
                  data[-1], names(data)[-1])
names(data)[-1] <- paste0("c", 1:3)
data
#  id  c1   c2     c3
#1  1  red blue   None
#2  2 None blue   None
#3  3  red blue   None
#4  4 None blue   None
#5  5  red None   None
#6  6 None blue   None
#7  7 None blue   None
#8  8 None blue yellow
#9  9 None None yellow

Or another option by changing the levels 或通过更改levels另一种选择

data[-1] <-  Map(function(x, y) {levels(x) <- c('None', y)
                    x},data[-1], names(data)[-1])

Or with lapply , we loop through the sequence of columns, extract the columns, change it to numeric , and use the index to change the values to column names and 'None' 或者使用lapply ，我们遍历列的顺序，提取列，将其更改为numeric ，然后使用索引将值更改为列名和“无”

data[-1] <- lapply(seq_along(data[-1]), function(i) 
         c("None", names(data)[-1][i])[as.numeric(data[-1][[i]])] )

NOTE: Gives the expected output. 注意：提供预期的输出。

Or using a vectorized approach, we create a logical matrix, multiply with column index and change the index to column names 或使用向量化方法，我们创建一个逻辑矩阵，乘以列索引，然后将索引更改为列名

data[-1] <-  `dim<-`(names(data)[-1][col(data[-1]) * 
                       (NA ^(data[-1] == 0))], dim(data[-1]))

Or another option with replace 或另一个replace选项

data[-1] <- replace(as.matrix(data[-1]), data[-1]==1, 
               rep(names(data)[-1], colSums(data[-1] == 1)))

Or using tidyverse 或使用tidyverse

library(tidyverse)
imap(data[-1], ~ c('none', .y)[as.numeric(.x)]) %>%
         bind_cols(data[1], .) %>% 
         rename_at(2:4, ~ paste0("c", 1:3))
#  id   c1   c2     c3
#1  1  red blue   none
#2  2 none blue   none
#3  3  red blue   none
#4  4 none blue   none
#5  5  red none   none
#6  6 none blue   none
#7  7 none blue   none
#8  8 none blue yellow
#9  9 none none yellow

Or with gather/spread 或与gather/spread

data %>% 
    gather(key, val, -id) %>% 
    mutate(val = case_when(val == 1 ~ key), 
    key = factor(key, levels = unique(key), labels = paste0("c", 1:3))) %>% 
    spread(key, val)

Benchmarks 基准测试

Here are some benchmarks 这是一些基准

data1 <- data[rep(seq_len(nrow(data)), 1e5),]
system.time({
  Map(function(x, y) c('None', y)[as.numeric(x)], 
                   data1[-1], names(data1)[-1])

 })
#   user  system elapsed 
#  0.065   0.014   0.078 


system.time({
 `dim<-`(names(data1)[-1][col(data1[-1]) * 
                       (NA ^(data1[-1] == 0))], dim(data1[-1]))

})
# user  system elapsed 
#  0.387   0.036   0.422 

system.time({
   imap(data1[-1], ~ c('none', .y)[as.numeric(.x)]) 
})
# user  system elapsed 
# 0.047   0.006   0.054 


 system.time({
   lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
  }
  )
 # user  system elapsed 
 #  0.555   0.067   0.621 


system.time({
    ifelse(data1[-1] == 1, names(data1[-1])[col(data1[-1])], NA)

  })
# user  system elapsed 
#  0.711   0.060   0.770

On a 1e6 dataset 在1e6数据集上

data1 <- data[rep(seq_len(nrow(data)), 1e6),] 
system.time({Map( function(x, y) {levels(x) <- c('None', y)
              x},data1[-1], names(data1)[-1])})
#   user  system elapsed 
#    0.123   0.016   0.139 

system.time({
   Map(function(x, y) c('None', y)[as.numeric(x)], 
                   data1[-1], names(data1)[-1])

  })
#   user  system elapsed 
#  0.328   0.074   0.402 
 system.time({
    lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
   }
   )
#   user  system elapsed 
#  7.125   0.463   7.561

with microbenchmark 带有微基准

library(microbenchmark)
microbenchmark(ak = Map(function(x, y) c('None', y)[as.numeric(x)], 
                 data1[-1], names(data1)[-1]), 
         ak2 = Map( function(x, y) {levels(x) <- c('None', y); x},data1[-1], names(data1)[-1]),
          rs = lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA)), unit = 'relative', times = 10L)
#Unit: relative
#expr      min        lq      mean    median       uq      max nev 
#ak  6.14964  4.048205  2.401768  1.741373  2.47268  2.43698    10
#ak2  1.00000  1.000000  1.000000  1.000000  1.00000  1.00000    10
#rs 70.73601 45.468868 23.020272 20.408306 18.63263 16.01278    10

data 数据

data <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), red = structure(c(2L, 
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
blue = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("0", 
"1"), class = "factor"), yellow = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor")), 
 class = "data.frame", row.names = c(NA, 
-9L))

Answer 3

This perhaps? 这也许吗？

library(tidyverse)

data <- data %>%
  mutate(red = case_when(
    red == 1 ~ "Red", red == 0 ~ "None")) %>% 
  mutate(blue = case_when(
    blue == 1 ~ "Blue", blue == 0 ~ "None")) %>% 
  mutate(yellow = case_when(
    yellow == 1 ~ "Yellow", yellow == 0 ~ "None"
))

data

将具有伪变量的数据框转换为分类变量

问题描述

3 个解决方案

解决方案1
4 已采纳 2018-09-03 03:31:41

解决方案2
1 2018-09-03 03:28:47

Benchmarks 基准测试

data 数据

解决方案3
0 2018-09-03 03:42:52

将具有伪变量的数据框转换为分类变量

问题描述

3 个解决方案

解决方案1 4 已采纳 2018-09-03 03:31:41

解决方案2 1 2018-09-03 03:28:47

Benchmarks 基准测试

data 数据

解决方案3 0 2018-09-03 03:42:52

解决方案1
4 已采纳 2018-09-03 03:31:41

解决方案2
1 2018-09-03 03:28:47

解决方案3
0 2018-09-03 03:42:52