简体   繁体   中英

summarize dataframe on multiple variable

I have the dataframe shown below:

ID        Date            Status         Category
TR-1      2018-01-10      Passed         A
TR-2      2018-01-09      Passed         B
TR-3      2018-01-09      Failed         C
TR-3      2018-01-09      Failed         A
TR-4      2018-01-08      Failed         B
TR-5      2018-01-08      Passed         C
TR-5      2018-01-08      Failed         A
TR-6      2018-01-07      Passed         A

Using the dataframe above, I want output in the format shown below:

The Date should be in descending order and the category sequence should be like C, A and B.

Date         count      distinct_count      Passed     Failed
2018-01-10   1          1                   1          0
    A        1          1                   1          0
    B        0          0                   0          0
    C        0          0                   0          0
2018-01-09   3          2                   1          2
    A        1          1                   1          0
    B        1          1                   1          0
    C        1          1                   1          0

To derive the above output, I tried the code below, but it does not work and I am not able to get the expected output.

# Asker's attempt, reported as not working; the code itself is kept unchanged.
# NOTE(review): A, B and C are values of the Category column, not columns of DF,
# so they cannot serve as grouping variables in group_by().
Output<-DF %>%
  group_by(Date=Date,A,B,C) %>%
  summarise(`Count`  = n(),
            `Distinct_count` = n_distinct(ID),
            Passed=sum(Status=='Passed'),
            # NOTE(review): `category='A'` passes a named argument instead of testing
            # equality (`==`), and the column in the data is spelled `Category`.
            A=count(category='A'),
            B=count(category='B'),
            C=count(category='C'),
            Failed=sum(Status=='Failed'))

Dput:

structure(list(ID = structure(c(1L, 2L, 3L, 3L, 4L, 5L, 5L, 6L
), .Label = c("TR-1", "TR-2", "TR-3", "TR-4", "TR-5", "TR-6"), class = "factor"), 
    Date = structure(c(4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("07/01/2018", 
    "08/01/2018", "09/01/2018", "10/01/2018"), class = "factor"), 
    Status = structure(c(2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("Failed", 
    "Passed"), class = "factor"), Category = structure(c(1L, 
    2L, 3L, 1L, 2L, 3L, 1L, 1L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("ID", 
"Date", "Status", "Category"), class = "data.frame", row.names = c(NA, 
-8L))

That was a tough one:

# Convert the grouping variables to factors: this fixes the output order and lets
# dcast(..., drop = FALSE) below fill in Date/Category combinations that never
# occur in the data.
# Category levels follow the order of first appearance in df1.
# NOTE(review): `%>%` is used here before any package providing it is attached in
# this snippet -- presumably magrittr/dplyr is already loaded; confirm.
df1$Category <- factor(df1$Category, levels = unique(df1$Category))
# Distinct dates, newest first, converted back to character to serve as factor levels.
date_lvls    <- as.Date(df1$Date, "%Y-%m-%d") %>% unique %>% sort(decreasing = TRUE) %>% as.character
df1$Date     <- factor(df1$Date, date_lvls)

# Switch to data.table; setDT() converts df1 in place.
library(data.table)
setDT(df1)

# Lookup table for the duplicated-ID issue: for each date, the number of IDs that
# occur exactly twice on that date, as a vector named by date.
# NOTE(review): because of the `== 2` test, an ID occurring three or more times
# on one date is not counted here.
tmp <- dcast.data.table(df1, Date ~ ID, fun.aggregate = length)
tmp <- structure(rowSums(tmp[,-1] == 2), .Names = as.character(unlist(tmp[, 1])))

# Main pipeline: one chained data.table expression, one step per `[...]`.
dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[
    # Turn the Failed/Passed cells into 0/1 flags (1 = not NA, 0 = NA).
    ,`:=`(Failed=+!is.na(Failed), Passed=+!is.na(Passed))][
    # count and distinct_count both receive Failed + Passed of the row.
    , c("count","distinct_count") := rowSums(cbind(Failed,Passed))][
    # Category becomes character so it can also hold the date label below.
    , Category := as.character(Category)][
    # Per Date: a totals row (labelled with the date in the Category column) is
    # placed on top of that date's category rows (.SD); the date's distinct_count
    # is reduced by the duplicated-ID count looked up in tmp.
    , rbind(
        cbind(Category = as.character(Date[1]), count = sum(count), distinct_count = sum(distinct_count) - tmp[as.character(Date[1])], Passed = sum(Passed), Failed = sum(Failed)),
        .SD
       , fill = TRUE), by = Date][
    # Drop the grouping column (the date is now carried in the Category column);
    # the trailing [] makes the result print.
    , Date := NULL ][]

result:

 #     Category count distinct_count Passed Failed
 #1: 2018-01-10     1              1      1      0
 #2:          A     1              1      1      0
 #3:          B     0              0      0      0
 #4:          C     0              0      0      0
 #5: 2018-01-09     3              2      1      2
 #6:          A     1              1      0      1
 #7:          B     1              1      1      0
 #8:          C     1              1      0      1
 #9: 2018-01-08     3              2      1      2
#10:          A     1              1      0      1
#11:          B     1              1      0      1
#12:          C     1              1      1      0
#13: 2018-01-07     1              1      1      0
#14:          A     1              1      1      0
#15:          B     0              0      0      0
#16:          C     0              0      0      0

data:

df1<-
structure(list(ID = c("TR-1", "TR-2", "TR-3", "TR-3", "TR-4", 
"TR-5", "TR-5", "TR-6"), Date = c("2018-01-10", "2018-01-09", 
"2018-01-09", "2018-01-09", "2018-01-08", "2018-01-08", "2018-01-08", 
"2018-01-07"), Status = c("Passed", "Passed", "Failed", "Failed", 
"Failed", "Passed", "Failed", "Passed"), Category = c("A", "B", 
"C", "A", "B", "C", "A", "A")), row.names = c(NA, -8L), class = "data.frame")

please note:

  • please run the chained call step by step. To do this, replace the open bracket `[` at the end of a line with `[]` and run the expression up to that point, e.g.

    1. run : dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[]

    2. run : dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[ , `:=`(Failed=+!is.na(Failed), Passed=+!is.na(Passed))][]

    3. ... till the end

    4. If anything is still unclear, please ask me about that specific step.

I'm sure that there must be a more elegant solution, but using tidyverse you can do:

# Stack a per-date totals table on a per-date/per-category table so that each
# date row is followed by one row per category.
#
# Changes compared with the earlier version of this snippet:
#  * across() + coalesce() on the four count columns replaces the deprecated
#    mutate_all(funs(...)), which also touched the non-numeric Category column.
#  * complete(Date, Category) runs on ungrouped data, so missing categories are
#    added even when Category is a character column rather than a factor.
#  * date_id is derived from Date itself instead of assuming exactly three
#    categories per date.
#  * Date is made character in both tables, so the final arrange() puts the
#    date label (starting with a digit) ahead of the category letters.

# Counts per Date x Category, absent combinations added with zero counts.
per_category <- df %>%
  group_by(Date, Category) %>%
  summarise(count = n(),
            distinct_count = n_distinct(ID),
            passed = sum(Status == "Passed"),
            failed = sum(Status == "Failed")) %>%
  ungroup() %>%
  complete(Date, Category) %>%
  mutate(across(c(count, distinct_count, passed, failed),
                ~ coalesce(.x, 0L))) %>%
  # date_id must be computed before Date is overwritten with the category label
  mutate(date_id = factor(as.integer(factor(Date))),
         Date = as.character(Category)) %>%
  select(-Category)

# Totals per Date; distinct_count is taken over all categories of the date.
per_date <- df %>%
  group_by(Date) %>%
  summarise(count = n(),
            distinct_count = n_distinct(ID),
            passed = sum(Status == "Passed"),
            failed = sum(Status == "Failed")) %>%
  mutate(date_id = factor(as.integer(factor(Date))),
         Date = as.character(Date))

bind_rows(per_category, per_date) %>%
  arrange(date_id, Date)

   Date       count distinct_count passed failed date_id
   <chr>      <int>          <int>  <int>  <int> <fct>  
 1 07/01/2018     1              1      1      0 1      
 2 A              1              1      1      0 1      
 3 B              0              0      0      0 1      
 4 C              0              0      0      0 1      
 5 08/01/2018     3              2      1      2 2      
 6 A              1              1      0      1 2      
 7 B              1              1      0      1 2      
 8 C              1              1      1      0 2      
 9 09/01/2018     3              2      1      2 3      
10 A              1              1      0      1 3      
11 B              1              1      1      0 3      
12 C              1              1      0      1 3      
13 10/01/2018     1              1      1      0 4      
14 A              1              1      1      0 4      
15 B              0              0      0      0 4      
16 C              0              0      0      0 4 

First, it creates a df with the count, distinct_count, passed and failed columns based on "Date" and "Category". Second, complete() generates all levels of "Category" and coalesce() then fills the counts of the previously missing levels with 0. Third, it creates a second df with the same four columns based on just "Date". Finally, it combines the two dfs by rows.

Sample data:

df <- read.table(text = "ID        Date            Status         Category
TR-1      2018-01-10      Passed         A
                 TR-2      2018-01-09      Passed         B
                 TR-3      2018-01-09      Failed         C
                 TR-3      2018-01-09      Failed         A
                 TR-4      2018-01-08      Failed         B
                 TR-5      2018-01-08      Passed         C
                 TR-5      2018-01-08      Failed         A
                 TR-6      2018-01-07      Passed         A", header = TRUE)

Mixing variables such as $Date and $Category in one and the same column is a bad idea because, as noted by @Luminata it makes further processing of the data very difficult.

While it is rather unclear what you want to achieve, and therefore any answer must be tentative, here's a solution that might get you closer to your goal:

If this is your data:

df <- data.frame(
  ID = c("TR-1","TR-2", "TR-3", "TR-3", "TR-4", "TR-5", "TR-5", "TR-6"),       
  Date = c("2018-01-10", "2018-01-09", "2018-01-09", "2018-01-09", "2018-01-08", "2018-01-08", "2018-01-08", "2018-01-07"),            
  Status = c("Passed","Passed","Failed","Failed","Failed","Passed","Failed", "Passed"),         
 Category = c("A","B","C","A","B","C","A","A")
)

and what you want is to separate the data by $Date, then why not create a list of separate dataframes, one per date, using the by function (the function passed to it simply returns each subset unchanged):

# Split df into one sub-data-frame per Date, keeping each subset as it is,
# then print the resulting list.
df_list <- by(data = df, INDICES = df$Date, FUN = identity)
print(df_list)
df$Date: 2018-01-07
    ID       Date Status Category
8 TR-6 2018-01-07 Passed        A
------------------------------------------------------------------------------------------ 
df$Date: 2018-01-08
    ID       Date Status Category
5 TR-4 2018-01-08 Failed        B
6 TR-5 2018-01-08 Passed        C
7 TR-5 2018-01-08 Failed        A
------------------------------------------------------------------------------------------ 
df$Date: 2018-01-09
    ID       Date Status Category
2 TR-2 2018-01-09 Passed        B
3 TR-3 2018-01-09 Failed        C
4 TR-3 2018-01-09 Failed        A
------------------------------------------------------------------------------------------ 
df$Date: 2018-01-10
    ID       Date Status Category
1 TR-1 2018-01-10 Passed        A

You could use a mix of lapply on the different levels of the two columns you want to use, mixed with do.call("rbind",x) , to bring that back as an array.

Something like this:

# Build one summary row per Date x Category combination (including combinations
# that never occur) and stack the rows into a single data frame.
#
# The distinct values are taken with sort(unique(...)) rather than levels():
# read.table() keeps strings as character by default since R 4.0.0, in which case
# levels() returns NULL and no rows would be produced. For factor columns the
# result is the same as before.
date_lvls     <- sort(unique(as.character(DF$Date)))
category_lvls <- sort(unique(as.character(DF$Category)))
status_lvls   <- sort(unique(as.character(DF$Status)))

res <- do.call("rbind", lapply(date_lvls, function(d) {
  do.call("rbind", lapply(category_lvls, function(ctg) {
    # Fixed factor levels keep one column per status in every row, even when a
    # status is absent from this Date/Category subset.
    tbl <- table(factor(DF$Status[DF$Category == ctg & DF$Date == d],
                        levels = status_lvls))
    # count = rows in the subset; distinct_count = number of different statuses
    # present in the subset.
    cbind(Date = d, Category = ctg, count = sum(tbl),
          distinct_count = sum(tbl > 0), t(tbl))
  }))
}))
res <- as.data.frame(res)

I added a few lines to the dataset, so the input data frame is:

# Extended sample data (11 rows) used by the summary code of this answer.
# The first header field is "ID" (it previously read "fD"), matching the ID
# column of the original data.
DF <- read.table(text =
"ID    Date    Status    Category
TR-1    2018-01-10    Passed    A
TR-2    2018-01-09    Passed    B
TR-3    2018-01-09    Failed    C
TR-4    2018-01-09    Failed    A
TR-5    2018-01-08    Failed    B
TR-6    2018-01-08    Passed    C
TR-7    2018-01-08    Failed    A
TR-8    2018-01-08    Passed    B
TR-9    2018-01-08    Failed    A
TR-10    2018-01-08    Failed    A
TR-11    2018-01-07    Passed    A"
, header = TRUE)

The first block of code will then output:

> res
         Date Category count distinct_count Failed Passed
1  2018-01-07        A     1              1      0      1
2  2018-01-07        B     0              0      0      0
3  2018-01-07        C     0              0      0      0
4  2018-01-08        A     3              1      3      0
5  2018-01-08        B     2              2      1      1
6  2018-01-08        C     1              1      0      1
7  2018-01-09        A     1              1      1      0
8  2018-01-09        B     1              1      0      1
9  2018-01-09        C     1              1      1      0
10 2018-01-10        A     1              1      0      1
11 2018-01-10        B     0              0      0      0
12 2018-01-10        C     0              0      0      0

Edit: I think I finally guessed what you meant by "distinct count" so I update the answer.

As others have noted, mixing your variables in one column may not be the best idea, but I've done it by simply combining the rows afterwards:

 library(tidyr)
 library(dplyr)

 # Counts per Date x Category; combinations that never occur are added with
 # zero counts (integer zeros, matching the type of the counted columns).
 Output <- DF %>%
   group_by(Date, Category) %>%
   summarise(Count = n(),
             Distinct_Count = n_distinct(ID),
             Passed = sum(Status == "Passed"),
             Failed = sum(Status == "Failed")) %>%
   ungroup() %>%
   complete(Date, Category,
            fill = list(Count = 0L, Distinct_Count = 0L, Passed = 0L, Failed = 0L))

 # Totals per date, newest date first. They are computed from DF itself rather
 # than by summing Output: an ID that occurs in several categories on the same
 # date (e.g. TR-3) has to be counted once in Distinct_Count, not once per
 # category.
 perDay <- DF %>%
   group_by(Date) %>%
   summarise(Count = n(),
             Distinct_Count = n_distinct(ID),
             Passed = sum(Status == "Passed"),
             Failed = sum(Status == "Failed")) %>%
   arrange(desc(Date))

 # Keep the real date in `indate` and reuse the Date column for the category
 # label, so the category rows can be stacked under their date row.
 Output$indate <- Output$Date
 Output$Date <- Output$Category
 # For each date (iterated as character so factor level sets never clash):
 # the totals row followed by that date's category rows, selected by name.
 Combined <- bind_rows(lapply(as.character(perDay$Date), function(date) {
   rbind(perDay[perDay$Date == date, ],
         Output[Output$indate == date,
                c("Date", "Count", "Distinct_Count", "Passed", "Failed")])
 }))

The data frame Output holds the counts per date and category (completed with zeros where a combination is missing) and perDay holds the totals per date; only afterwards are the two bound together, date by date.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM