![](/img/trans.png)
[英]Is there a function in R tidyverse to categorize character values of a column based on key words and assign a category?
[英]R function categorize by column?
我想编写一个函数,它接受一个数据框,计算多列的出现次数,然后根据列名称的出现为该行分配一个“类别”。
以此df为例:
df <- data.frame(k1 = c(0,0,3,4,5,1),
k2 = c(1,0,0,4,5,0),
k3 = c(0,0,0,8,0,0),
k4 = c(2,5,0,3,4,5))
我希望输出看起来像这样:
df.final<-data.frame(k1 = c(0,0,3,4,5,1),
k2 = c(1,0,0,4,5,0),
k3 = c(0,0,0,8,0,0),
k4 = c(2,5,0,3,4,5),
Category = c("k2_k4","k4","k1","k1_k2_k3_k4","k1_k2_k4","k1_k4"))
当然,我的实际数据是很多很多行,我希望这个函数可以用来评估任意列数的数据帧。 我只是不确定如何编写该函数。 我是一个写新手的功能!
您可以使用data.table::transpose()
函数使每一行成为一个向量,然后使用sapply
循环遍历列表并粘贴值不为零的相应列名:
df$category = sapply(data.table::transpose(df),
function(r) paste0(names(df)[r != 0], collapse = "_"))
df
# k1 k2 k3 k4 category
#1 0 1 0 2 k2_k4
#2 0 0 0 5 k4
#3 3 0 0 0 k1
#4 4 4 8 3 k1_k2_k3_k4
#5 5 5 0 4 k1_k2_k4
#6 1 0 0 5 k1_k4
在基地R,有很多选择。 一:
df$Category <- apply(df > 0, 1, function(x){toString(names(df)[x])})
df
## k1 k2 k3 k4 Category
## 1 0 1 0 2 k2, k4
## 2 0 0 0 5 k4
## 3 3 0 0 0 k1
## 4 4 4 8 3 k1, k2, k3, k4
## 5 5 5 0 4 k1, k2, k4
## 6 1 0 0 5 k1, k4
或使用下划线,
df$Category <- apply(df > 0, 1, function(x){paste(names(df)[x], collapse = '_')})
df
## k1 k2 k3 k4 Category
## 1 0 1 0 2 k2_k4
## 2 0 0 0 5 k4
## 3 3 0 0 0 k1
## 4 4 4 8 3 k1_k2_k3_k4
## 5 5 5 0 4 k1_k2_k4
## 6 1 0 0 5 k1_k4
一种有趣的替代方法是purrr::by_row
:
library(purrr)
df %>% by_row(~toString(names(.)[.x > 0]), .collate = 'cols', .to = 'Category')
## # A tibble: 6 × 5
## k1 k2 k3 k4 Category
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0 1 0 2 k2, k4
## 2 0 0 0 5 k4
## 3 3 0 0 0 k1
## 4 4 4 8 3 k1, k2, k3, k4
## 5 5 5 0 4 k1, k2, k4
## 6 1 0 0 5 k1, k4
df$Category = paste(ifelse(df$k1>0, 'k1_',''), ifelse(df$k2>0, 'k2_',''), ifelse(df$k3>0, 'k3_',''), ifelse(df$k4>0, 'k4_',''), sep='')
结果:
k1 k2 k3 k4 Category
1 0 1 0 2 k2_k4_
2 0 0 0 5 k4_
3 3 0 0 0 k1_
4 4 4 8 3 k1_k2_k3_k4_
5 5 5 0 4 k1_k2_k4_
6 1 0 0 5 k1_k4_
也许得到更有效的方式。 我太新手了。
使用data.table
:
library(data.table)
setDT(df)
df[ , I := .I]
df[melt(df, id.vars = "I")[value != 0,
paste(variable, collapse = "_"),
keyby = I],
Category := i.V1, on = "I"][]
# k1 k2 k3 k4 I Category
# 1: 0 1 0 2 1 k2_k4
# 2: 0 0 0 5 2 k4
# 3: 3 0 0 0 3 k1
# 4: 4 4 8 3 4 k1_k2_k3_k4
# 5: 5 5 0 4 5 k1_k2_k4
# 6: 1 0 0 5 6 k1_k4
方法草图:
melt
数据 - 让我们在不转换为矩阵的情况下“按行”进行操作 我们可以在base R
中以矢量化形式执行此操作(不使用包)。
df$category <- gsub('^NA_|NA_+|_NA', '', do.call(paste,
c(as.data.frame(`dim<-`(names(df)[(NA^!df)*col(df)], dim(df))), sep="_")))
df$category
#[1] "k2_k4" "k4" "k1" "k1_k2_k3_k4" "k1_k2_k4" "k1_k4"
1)想法是将数据集转换为逻辑向量( !df
- 为0时返回TRUE,其他值返回FALSE)
2)将TRUE值更改为NA( NA^
)
3)然后乘以列索引( col(df)
)
4)使用此索引填充列名称
5)输出是一个vector
,因此我们在分配原始数据集的维度后将其更改为data.frame
6) paste
与行元素do.call(paste
7)最后使用gsub
NA
替换为空白( ""
)
数据集
set.seed(24)
df <- data.frame(k1 = sample(0:5, 1e6, replace=TRUE),
k2 = sample(0:7, 1e6, replace = TRUE),
k3 = sample(0:8, 1e6, replace=TRUE),
k4 = sample(0:4, 1e6, replace = TRUE))
df2 <- copy(df)
setDT(df2)
职能
psidom <- function(){
sapply(data.table::transpose(df),
function(r) paste0(names(df)[r != 0], collapse = "_"))}
akrun <- function(){
gsub('^NA_|NA_+|_NA', '', do.call(paste,
c(as.data.frame(`dim<-`(names(df)[(NA^!df)*col(df)], dim(df))), sep="_")))
}
ae <- function(){
apply(df > 0, 1, function(x){toString(names(df)[x])})}
ae2 <- function(){
df %>%
by_row(~toString(names(.)[.x > 0]),
.collate = 'cols', .to = 'Category')
}
MC <- function(){
df2[ , I := .I]
df2[melt(df2, id.vars = "I")[value != 0,
paste(variable, collapse = "_"),
keyby = I],
Category := i.V1, on = "I"][]
}
Eric <- function() {
paste(ifelse(df$k1>0, 'k1_',''),
ifelse(df$k2>0, 'k2_',''),
ifelse(df$k3>0, 'k3_',''),
ifelse(df$k4>0, 'k4_',''), sep='')
}
基准输出 - system.time
system.time(psidom())
# user system elapsed
# 7.91 0.06 7.97
system.time(ae())
# user system elapsed
# 10.22 0.00 10.22
system.time(ae2())
# user system elapsed
# 100.60 0.27 101.44
system.time(MC())
# user system elapsed
# 4.22 0.03 4.25
system.time(Eric())
# user system elapsed
# 1.40 0.00 1.41
system.time(akrun())
# user system elapsed
# 1.53 0.00 1.53
基准输出 - 微基准测试
library(microbenchmark)
microbenchmark(psidom(), akrun(), ae(), ae2(), MC(), Eric(), unit = "relative",
times = 10)
#Unit: relative
# expr min lq mean median uq max neval
# psidom() 4.0824126 4.1283338 3.9332463 4.237229 3.4060509 4.2147045 10
# akrun() 1.0000000 1.0000000 1.0000000 1.000000 1.0000000 1.0000000 10
# ae() 6.7507093 6.9175511 6.0683960 6.725867 5.1087104 5.1901925 10
# ae2() 62.4294377 61.4709644 53.7637154 59.873279 44.9316386 44.9233634 10
# MC() 3.1439541 3.4666872 3.1479070 3.559120 2.7554062 2.8741309 10
# Eric() 0.9091862 0.9628939 0.9702425 1.042875 0.9878793 0.9686051 10
讨论/评论
@ Eric的方法是最快的,但是当列数更多时,嵌套的ifelse
语句也会更多..
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.