[英]summarize dataframe on multiple variable
我有下面提到的数据帧:
ID Date Status Category
TR-1 2018-01-10 Passed A
TR-2 2018-01-09 Passed B
TR-3 2018-01-09 Failed C
TR-3 2018-01-09 Failed A
TR-4 2018-01-08 Failed B
TR-5 2018-01-08 Passed C
TR-5 2018-01-08 Failed A
TR-6 2018-01-07 Passed A
通过使用上面给出的数据帧,我想得到如下所示格式的输出:`Date` 应按降序排列,类别的顺序应为 C、A 和 B。
Date count distinct_count Passed Failed
2018-01-10 1 1 1 0
A 1 1 1 0
B 0 0 0 0
C 0 0 0 0
2018-01-09 3 2 1 2
A 1 1 1 0
B 1 1 1 0
C 1 1 1 0
为了得到上面的输出,我尝试了下面的代码,但它无法工作,无法获得预期的输出。
# Asker's attempt, reported in the question as not producing the expected output.
# Reviewer notes:
# - group_by() refers to A, B and C, but the data frame shown in the question
#   has no such columns; A, B and C are values of the Category column.
# - The data column is named `Category`, while the calls below use `category=`.
# - NOTE(review): count() here is presumably dplyr::count(), which expects a
#   data frame as its first argument rather than a condition -- confirm.
Output<-DF %>%
group_by(Date=Date,A,B,C) %>%
summarise(`Count` = n(),
`Distinct_count` = n_distinct(ID),
Passed=sum(Status=='Passed'),
A=count(category='A'),
B=count(category='B'),
C=count(category='C'),
Failed=sum(Status=='Failed'))
Dput:
# dput() representation of the question's data frame (8 rows; the expression is
# not assigned to a name here). All four columns (ID, Date, Status, Category)
# are stored as factors, and the Date labels are written as day/month/year.
structure(list(ID = structure(c(1L, 2L, 3L, 3L, 4L, 5L, 5L, 6L
), .Label = c("TR-1", "TR-2", "TR-3", "TR-4", "TR-5", "TR-6"), class = "factor"),
Date = structure(c(4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("07/01/2018",
"08/01/2018", "09/01/2018", "10/01/2018"), class = "factor"),
Status = structure(c(2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("Failed",
"Passed"), class = "factor"), Category = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 1L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("ID",
"Date", "Status", "Category"), class = "data.frame", row.names = c(NA,
-8L))
这是一个艰难的:
# Convert Category and Date to factors so that the row order is fixed and so
# that unobserved Date x Category combinations can be filled in later by dcast.
# Category levels follow their order of first appearance in the data.
df1$Category <- factor(df1$Category, levels = unique(df1$Category))
# Date levels are the distinct dates, newest first. Plain nested calls are used
# instead of `%>%` because no package that provides the pipe has been attached
# at this point, so the piped form fails with "could not find function".
date_lvls <- as.character(
  sort(unique(as.Date(df1$Date, "%Y-%m-%d")), decreasing = TRUE)
)
df1$Date <- factor(df1$Date, levels = date_lvls)
# lets use data.table
library(data.table)
setDT(df1)
# Lookup table for the duplicated-ID issue: for every Date, the number of
# surplus rows caused by IDs that occur more than once on that date. It is
# later subtracted from the summed per-category counts to get distinct_count.
tmp <- dcast.data.table(df1, Date ~ ID, fun.aggregate = length)
# An ID seen n times on a date contributes n - 1 surplus rows. Testing the
# counts with `== 2` would only correct IDs seen exactly twice and would miss
# IDs seen three or more times on the same date.
tmp <- structure(
  rowSums(pmax(as.matrix(tmp[, -1]) - 1L, 0L)),
  .Names = as.character(unlist(tmp[, 1]))
)
# precaution! Boilerplate incoming in 3, 2, .. 1
# Each `[...]` below is chained onto the result of the previous step.
# Step 1: cast to one row per Date x Category with one column per Status value;
#         drop = FALSE keeps combinations that never occur in the data.
dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[
# Step 2: turn the Failed / Passed cells into 0/1 flags (1 = cell is not NA).
,`:=`(Failed=+!is.na(Failed), Passed=+!is.na(Passed))][
# Step 3: count and distinct_count both start as Failed + Passed of the row.
, c("count","distinct_count") := rowSums(cbind(Failed,Passed))][
# Step 4: Category becomes character -- presumably so that the date labels
#         added in step 5 can live in the same column.
, Category := as.character(Category)][
# Step 5: per Date, build a header row (date label, summed counts, and
#         distinct_count reduced by the duplicate-ID lookup `tmp`) and stack
#         it on top of that date's category rows (.SD).
, rbind(
cbind(Category = as.character(Date[1]), count = sum(count), distinct_count = sum(distinct_count) - tmp[as.character(Date[1])], Passed = sum(Passed), Failed = sum(Failed)),
.SD
, fill = TRUE), by = Date][
# Step 6: remove the grouping column Date; the trailing [] prints the result.
, Date := NULL ][]
结果:
# Category count distinct_count Passed Failed
#1: 2018-01-10 1 1 1 0
#2: A 1 1 1 0
#3: B 0 0 0 0
#4: C 0 0 0 0
#5: 2018-01-09 3 2 1 2
#6: A 1 1 0 1
#7: B 1 1 1 0
#8: C 1 1 0 1
#9: 2018-01-08 3 2 1 2
#10: A 1 1 0 1
#11: B 1 1 0 1
#12: C 1 1 1 0
#13: 2018-01-07 1 1 1 0
#14: A 1 1 1 0
#15: B 0 0 0 0
#16: C 0 0 0 0
数据:
# Sample data for the data.table answer: one row per test run, with all four
# columns kept as plain character vectors (no factors).
df1 <- data.frame(
  ID = c("TR-1", "TR-2", "TR-3", "TR-3", "TR-4", "TR-5", "TR-5", "TR-6"),
  Date = c(
    "2018-01-10", "2018-01-09", "2018-01-09", "2018-01-09",
    "2018-01-08", "2018-01-08", "2018-01-08", "2018-01-07"
  ),
  Status = c(
    "Passed", "Passed", "Failed", "Failed",
    "Failed", "Passed", "Failed", "Passed"
  ),
  Category = c("A", "B", "C", "A", "B", "C", "A", "A"),
  stringsAsFactors = FALSE
)
请注意:
请逐步运行这段代码。为此,您可以在每一步末尾那个尚未闭合的左方括号 `[` 之后补上 `]` 将其闭合,然后运行到该处为止,例如:
run: dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[]
run: dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[ ,
`:=`(Failed=+!is.na(Failed), Passed=+!is.na(Passed))][]
... 依此类推,直至最后一步。
如果有任何不清楚的地方,请针对具体问题向我提问。
我相信一定有更优雅的解决方案,但使用 `tidyverse` 你可以这样做:
# Build a report with one header row per date followed by one row per
# category, by stacking two summaries of `df` and sorting them together.
bind_rows(
  # Summary 1: counts per Date x Category.
  df %>%
    group_by(Date, Category) %>%
    summarise(count = n(),
              distinct_count = n_distinct(ID),
              passed = sum(Status == "Passed", na.rm = TRUE),
              failed = sum(Status == "Failed", na.rm = TRUE)) %>%
    ungroup() %>%
    # Add the Date x Category combinations that do not occur. This is done on
    # the ungrouped data so that every category seen anywhere is added to every
    # observed date, also when Category is a character column.
    complete(nesting(Date), Category) %>%
    # Only the count columns are zero-filled; applying coalesce() to every
    # column would also hit Category, which cannot be combined with 0L.
    mutate(count = coalesce(count, 0L),
           distinct_count = coalesce(distinct_count, 0L),
           passed = coalesce(passed, 0L),
           failed = coalesce(failed, 0L)) %>%
    arrange(Date, Category) %>%
    # date_id is the rank of the date, so it no longer assumes exactly three
    # categories per date. The category label then moves into the Date column.
    mutate(date_id = factor(match(Date, sort(unique(Date)))),
           Date = as.character(Category)) %>%
    select(-Category),
  # Summary 2: counts per Date only (these become the header rows).
  df %>%
    group_by(Date) %>%
    summarise(count = n(),
              distinct_count = n_distinct(ID),
              passed = sum(Status == "Passed", na.rm = TRUE),
              failed = sum(Status == "Failed", na.rm = TRUE)) %>%
    ungroup() %>%
    arrange(Date) %>%
    mutate(date_id = factor(match(Date, sort(unique(Date)))),
           Date = as.character(Date))
) %>%
  # Within each date_id the date label sorts before the category letters
  # because digits sort before letters, so the header row comes first.
  arrange(date_id, Date)
Date count distinct_count passed failed date_id
<chr> <int> <int> <int> <int> <fct>
1 07/01/2018 1 1 1 0 1
2 A 1 1 1 0 1
3 B 0 0 0 0 1
4 C 0 0 0 0 1
5 08/01/2018 3 2 1 2 2
6 A 1 1 0 1 2
7 B 1 1 0 1 2
8 C 1 1 1 0 2
9 09/01/2018 3 2 1 2 3
10 A 1 1 0 1 3
11 B 1 1 1 0 3
12 C 1 1 0 1 3
13 10/01/2018 1 1 1 0 4
14 A 1 1 1 0 4
15 B 0 0 0 0 4
16 C 0 0 0 0 4
首先,它按 `Date` 和 `Category` 分组,创建包含 count、distinct_count、passed 和 failed 列的第一个 df。其次,通过 `complete()` 补全 `Category` 的所有级别,再用 `coalesce()` 把不存在的级别填充为 0。第三,它仅按 `Date` 分组,创建包含 count、distinct_count、passed 和 failed 列的第二个 df。最后,它按行合并这两个 df。
样本数据:
# Sample data from the question, parsed from an inline whitespace-separated
# table whose first line holds the column names.
df <- read.table(
  header = TRUE,
  text = "ID Date Status Category
TR-1 2018-01-10 Passed A
TR-2 2018-01-09 Passed B
TR-3 2018-01-09 Failed C
TR-3 2018-01-09 Failed A
TR-4 2018-01-08 Failed B
TR-5 2018-01-08 Passed C
TR-5 2018-01-08 Failed A
TR-6 2018-01-07 Passed A"
)
像这样把 `$Date` 和 `$Category` 等不同变量混合放在同一列中是个坏主意,因为正如 @Luminata 所指出的那样,这会使数据的后续处理变得非常困难。
虽然目前还不清楚你想要达到什么目标,因此任何答案都必须是试探性的,这里有一个可以让你更接近目标的解决方案:
如果这是您的数据:
# Sample data from the question, built column by column.
df <- data.frame(
  # Run identifiers TR-1 .. TR-6; TR-3 and TR-5 each occur twice.
  ID = paste0("TR-", c(1, 2, 3, 3, 4, 5, 5, 6)),
  Date = c(
    "2018-01-10",
    "2018-01-09", "2018-01-09", "2018-01-09",
    "2018-01-08", "2018-01-08", "2018-01-08",
    "2018-01-07"
  ),
  Status = c(
    "Passed", "Passed", "Failed", "Failed",
    "Failed", "Passed", "Failed", "Passed"
  ),
  Category = c("A", "B", "C", "A", "B", "C", "A", "A")
)
你想要的是按 `$Date` 把数据分开,那么为什么不使用 `by` 函数(配合一个原样返回每组数据的匿名函数;这里并没有调用 base 的 `unique()`),为每个日期创建一个单独的数据框,并得到由这些数据框组成的列表:
# Split `df` into one data frame per distinct Date. The function passed to
# by() is a plain identity function: each group's rows are returned unchanged
# (base::unique() is not involved).
df_list <- by(
  df,
  df$Date,
  function(rows) rows
)
# Print the per-date list.
df_list
df$Date: 2018-01-07
ID Date Status Category
8 TR-6 2018-01-07 Passed A
------------------------------------------------------------------------------------------
df$Date: 2018-01-08
ID Date Status Category
5 TR-4 2018-01-08 Failed B
6 TR-5 2018-01-08 Passed C
7 TR-5 2018-01-08 Failed A
------------------------------------------------------------------------------------------
df$Date: 2018-01-09
ID Date Status Category
2 TR-2 2018-01-09 Passed B
3 TR-3 2018-01-09 Failed C
4 TR-3 2018-01-09 Failed A
------------------------------------------------------------------------------------------
df$Date: 2018-01-10
ID Date Status Category
1 TR-1 2018-01-10 Passed A
您可以对要使用的两列的各个级别嵌套使用 `lapply`,并结合 `do.call("rbind", x)` 把结果合并成一个数组返回。
像这样的东西:
# Summarise DF for every Date x Category combination, including combinations
# that never occur (those rows are all zeros).
#
# levels(as.factor(x)) is used instead of levels(x) so that this also works
# when the columns are character vectors (the read.table() default since
# R 4.0.0); levels() of a character vector is NULL, which made the result empty.
dates <- levels(as.factor(DF$Date))
categories <- levels(as.factor(DF$Category))
# Status is made a factor once so that every table() below has the same
# columns, even for subsets in which one of the statuses does not occur.
status <- as.factor(DF$Status)
res <- do.call("rbind", lapply(dates, function(d) {
  do.call("rbind", lapply(categories, function(ctg) {
    tbl <- table(status[DF$Category == ctg & DF$Date == d])
    # NOTE: distinct_count is the number of distinct Status values observed in
    # this subset, not the number of distinct IDs.
    cbind(Date = d, Category = ctg, count = sum(tbl),
          distinct_count = sum(tbl > 0), t(tbl))
  }))
}))
res <- as.data.frame(res)
# cbind() above yields a character matrix, so the count columns arrive as
# text; convert them back to integers.
count_cols <- setdiff(colnames(res), c("Date", "Category"))
if (length(count_cols) > 0) {
  res[count_cols] <- lapply(res[count_cols], function(x) as.integer(as.character(x)))
}
我在数据集中添加了几行,因此输入框应为:
# Extended sample data (11 rows) used by the snippets that work on `DF`.
# The first column must be named ID: the code that summarises DF refers to the
# ID column, and a header of "fD" leaves DF without one.
DF <- read.table(text =
"ID Date Status Category
TR-1 2018-01-10 Passed A
TR-2 2018-01-09 Passed B
TR-3 2018-01-09 Failed C
TR-4 2018-01-09 Failed A
TR-5 2018-01-08 Failed B
TR-6 2018-01-08 Passed C
TR-7 2018-01-08 Failed A
TR-8 2018-01-08 Passed B
TR-9 2018-01-08 Failed A
TR-10 2018-01-08 Failed A
TR-11 2018-01-07 Passed A"
, header = TRUE)
然后输出第一行代码:
> res
Date Category count distinct_count Failed Passed
1 2018-01-07 A 1 1 0 1
2 2018-01-07 B 0 0 0 0
3 2018-01-07 C 0 0 0 0
4 2018-01-08 A 3 1 3 0
5 2018-01-08 B 2 2 1 1
6 2018-01-08 C 1 1 0 1
7 2018-01-09 A 1 1 1 0
8 2018-01-09 B 1 1 0 1
9 2018-01-09 C 1 1 1 0
10 2018-01-10 A 1 1 0 1
11 2018-01-10 B 0 0 0 0
12 2018-01-10 C 0 0 0 0
编辑:我想我终于猜到你的意思是“非常统计”,所以我更新了答案。
正如其他人所指出的那样,将变量混合在一列可能不是最好的主意,但我之前只是通过组合行来完成它:
library(tidyr)
library(dplyr)

# Per Date x Category counts; combinations that do not occur in DF are added
# with all counts set to zero.
Output <- DF %>%
  group_by(Date, Category) %>%
  summarise(Count = n(),
            Distinct_Count = n_distinct(ID),
            Passed = sum(Status == "Passed"),
            Failed = sum(Status == "Failed")) %>%
  ungroup() %>%
  complete(Date, Category,
           fill = list(Count = 0L, Distinct_Count = 0L,
                       Passed = 0L, Failed = 0L))

# Per-day totals, newest date first. They are computed from the raw rows
# rather than by summing Output: adding up the per-category distinct counts
# would count an ID once for every category it appears in on that day.
perDay <- DF %>%
  group_by(Date) %>%
  summarise(Count = n(),
            Distinct_Count = n_distinct(ID),
            Passed = sum(Status == "Passed"),
            Failed = sum(Status == "Failed")) %>%
  ungroup() %>%
  arrange(desc(Date))

# Keep the real date in `indate` and show the category label in the Date
# column, so category rows can be stacked under their day's header row.
Output$indate <- Output$Date
Output$Date <- Output$Category

# Columns are selected by name rather than by position so the stacking does
# not depend on the column order of Output.
value_cols <- c("Date", "Count", "Distinct_Count", "Passed", "Failed")
Combined <- bind_rows(lapply(seq_len(nrow(perDay)), function(i) {
  day <- as.character(perDay$Date[i])
  rbind(perDay[i, ], Output[as.character(Output$indate) == day, value_cols])
}))
data.frame `perDay` 和 `Output` 分别统计每天以及每个类别的数值(必要时补全缺失的组合),之后才按天把它们绑定在一起。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.