I'm trying to summarise a dataframe using two variables - I basically want to break down variable 1 by variable 2 in order to plot the results in a 100% stacked bar chart.
I have multiple columns of type logical, which can be split between two main categories that will be used to create the breakdown.
I have tried to use gather() from tidyr to transform the dataframe to long form; however, the output is not what I expect.
#' Count co-occurrences between two groups of flag columns
#'
#' For every pair made of one column whose name matches `variable_1` and one
#' column whose name matches `variable_2`, counts the rows of `dataset` in
#' which both columns are TRUE. The result is in long form, one row per pair,
#' which is the shape needed for a 100% stacked bar chart.
#'
#' @param dataset A data frame whose flag columns are logical or hold the
#'   strings "TRUE"/"FALSE".
#' @param variable_1 Regular expression selecting the columns to break down
#'   (reported in the `topic` column of the result).
#' @param variable_2 Regular expression selecting the columns to break down by
#'   (reported in the `variable_2` column of the result).
#' @return A data frame with the columns `variable_2` (character), `topic`
#'   (character) and `volume` (integer). It has zero rows when either pattern
#'   matches no column.
topics_by_variable <- function(dataset, variable_1, variable_2) {
  column_names <- names(dataset)
  topic_idx <- grep(variable_1, column_names)
  group_idx <- grep(variable_2, column_names)
  # List the variable_2 columns alphabetically within each topic.
  group_idx <- group_idx[order(column_names[group_idx])]

  # Nothing to pair up: return an empty result with the usual columns.
  if (length(topic_idx) == 0L || length(group_idx) == 0L) {
    return(data.frame(
      variable_2 = character(0),
      topic = character(0),
      volume = integer(0),
      stringsAsFactors = FALSE
    ))
  }

  # Build a 0/1 matrix with one column per selected flag. as.logical() accepts
  # logical columns as well as "TRUE"/"FALSE" strings; `%in% TRUE` makes NA
  # count as "not TRUE" instead of turning the whole count into NA.
  flag_matrix <- function(idx) {
    flags <- lapply(idx, function(i) as.logical(dataset[[i]]) %in% TRUE)
    matrix(
      as.integer(unlist(flags, use.names = FALSE)),
      nrow = nrow(dataset),
      ncol = length(idx)
    )
  }

  # t(groups) %*% topics: cell [g, t] is the number of rows in which both
  # variable_2 column g and topic column t are TRUE.
  counts <- crossprod(flag_matrix(group_idx), flag_matrix(topic_idx))

  # as.integer() flattens column by column, i.e. all variable_2 columns for
  # the first topic, then all for the second, and so on.
  data.frame(
    variable_2 = rep(column_names[group_idx], times = length(topic_idx)),
    topic = rep(column_names[topic_idx], each = length(group_idx)),
    volume = as.integer(counts),
    stringsAsFactors = FALSE
  )
}
Here is the dataset I'm using:
structure(list(topic_su = c("TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE"), topic_so = c("FALSE",
"FALSE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "FALSE", "FALSE", "FALSE", "FALSE"), topic_cl = c("FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE"
), topic_in = c("FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE"), topic_qu = c("FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE"), topic_re = c("FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE"), brands_ne = c("TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE"
), brands_st = c("FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE"), brands_co = c("FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE"
), brands_seg = c("FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE"), brands_sen = c("TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE",
"TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE"), brands_ta = c("FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "TRUE"), brands_tc = c("FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", "FALSE",
"FALSE", "FALSE")), class = "data.frame", row.names = c(NA, -39L
))
The desired output would be the following. However, when I use gather, the volume figure is the total number of rows and is repeated across all brands.
variable_2 topic volume
<chr> <chr> <int>
1 brands_co topic_su 10
2 brands_ne topic_su 17
3 brands_seg topic_su 10
4 brands_sen topic_su 18
5 brands_st topic_su 0
6 brands_ta topic_su 1
7 brands_tc topic_su 0
8 brands_co topic_so 22
9 brands_ne topic_so 17
10 brands_seg topic_so 11
11 brands_sen topic_so 23
12 brands_st topic_so 0
13 brands_ta topic_so 0
14 brands_tc topic_so 0
Assuming that your dataset is dt, you can do something like this:
library(dplyr)
# Pair every "brands" column name with every "topic" column name, then, one
# pair at a time, count the rows of `dt` in which both columns hold "TRUE".
expand.grid(
  brand = names(dt)[grepl("brands", names(dt))],
  topic = names(dt)[grepl("topic", names(dt))],
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
) %>%
  # rowwise() makes `brand` and `topic` single names inside mutate().
  rowwise() %>%
  mutate(volume = sum(dt[brand] == "TRUE" & dt[topic] == "TRUE")) %>%
  ungroup()
# # A tibble: 42 x 3
# brand topic volume
# <chr> <chr> <int>
# 1 brands_ne topic_su 17
# 2 brands_st topic_su 0
# 3 brands_co topic_su 10
# 4 brands_seg topic_su 10
# 5 brands_sen topic_su 18
# 6 brands_ta topic_su 1
# 7 brands_tc topic_su 0
# 8 brands_ne topic_so 17
# 9 brands_st topic_so 0
#10 brands_co topic_so 22
# # ... with 32 more rows
The process does the following:
You get all column names (from original dataset) that match "brands" and "topic" and create all possible combinations between them.
For each combination, you get the corresponding columns of your original dataset and count how many times they are both TRUE.
An alternative could be to use a vectorised function instead of rowwise(), which might be faster:
# Vectorised counter: for each pair of column names (x, y), returns the number
# of rows of `dt` in which both columns hold the string "TRUE".
GetVolume <- Vectorize(function(x, y) {
  both_true <- dt[x] == "TRUE" & dt[y] == "TRUE"
  sum(both_true)
})
# Build every brand/topic pair of column names and let the vectorised
# GetVolume() count, per pair, the rows in which both columns are "TRUE".
expand.grid(
  brand = names(dt)[grepl("brands", names(dt))],
  topic = names(dt)[grepl("topic", names(dt))],
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
) %>%
  mutate(volume = GetVolume(brand, topic))
Another tidyverse solution:
library(tidyverse)
## data
head(df)
#> topic_su topic_so topic_cl topic_in topic_qu topic_re brands_ne
#> 1 TRUE FALSE FALSE FALSE FALSE FALSE TRUE
#> 2 TRUE FALSE FALSE FALSE FALSE FALSE TRUE
#> 3 TRUE TRUE FALSE FALSE FALSE FALSE TRUE
#> 4 TRUE TRUE FALSE FALSE FALSE FALSE TRUE
#> 5 TRUE TRUE FALSE FALSE FALSE FALSE TRUE
#> 6 TRUE TRUE FALSE FALSE FALSE FALSE TRUE
#> brands_st brands_co brands_seg brands_sen brands_ta brands_tc
#> 1 FALSE FALSE FALSE TRUE FALSE FALSE
#> 2 FALSE FALSE FALSE TRUE FALSE FALSE
#> 3 FALSE FALSE FALSE TRUE FALSE FALSE
#> 4 FALSE FALSE FALSE TRUE FALSE FALSE
#> 5 FALSE FALSE FALSE TRUE FALSE FALSE
#> 6 FALSE FALSE FALSE TRUE FALSE FALSE
# Count, for every topic/brand pair, the rows in which both flags are TRUE.
# Convert every column to logical first so the flags can be combined with `&`.
mutate_all(df, as.logical) %>%
# Stack the topic columns: one row per (original row, topic).
gather(key = "topic", value = "topic_value", starts_with("topic")) %>%
# Stack everything that does not start with "topic". This leaves out both the
# new `topic` key and `topic_value`, so only the remaining columns are stacked.
gather(key = "variable_2", value = "variable_2_value", -starts_with("topic")) %>%
group_by(topic, variable_2) %>%
# Sum the rows where the topic flag and the variable_2 flag are both TRUE.
summarize(volume = sum(topic_value & variable_2_value))
#> # A tibble: 42 x 3
#> # Groups: topic [6]
#> topic variable_2 volume
#> <chr> <chr> <int>
#> 1 topic_cl brands_co 22
#> 2 topic_cl brands_ne 16
#> 3 topic_cl brands_seg 15
#> 4 topic_cl brands_sen 15
#> 5 topic_cl brands_st 0
#> 6 topic_cl brands_ta 1
#> 7 topic_cl brands_tc 0
#> 8 topic_in brands_co 23
#> 9 topic_in brands_ne 16
#> 10 topic_in brands_seg 15
#> # … with 32 more rows
Created on 2019-06-24 by the reprex package (v0.3.0)
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.