I have a dataframe df
which contains per user responses as below:
userID pred_task obs1_task obs2_task exp1_task exp2_task postPOE_task
3108 H E E E M M
3207 H H E NA H M
3350 M H H NA H H
3961 E E E E E M
4021 H H E M H E
With some pre-processing I have been able to have additional features which contain the counts of:
E
= Number of times E was reported by a participant,
M
= Number of times M was reported by a participant,
H
= Number of times H was reported by a participant,
Moreover, I have also counted the bigram transitions (the counts of each pair of consecutive responses), where:
EE
--> Transition from E to E ,
EM
--> Transition from E to M ,
EH
--> Transition from E to H ,
ME
--> Transition from M to E ,
MM
--> Transition from M to M ,
MH
--> Transition from M to H ,
HE
--> Transition from H to E ,
HM
--> Transition from H to M ,
HH
--> Transition from H to H ,
The updated df
is as below:
userID pred_tsk obs1_tsk obs2_tsk exp1_tsk exp2_tsk postPOE_tsk E M H EE EM EH MM ME MH HH HE HM
3108 H E E E M M 3 2 1 2 1 0 1 0 0 0 0 0
3207 H H E NA H M 1 1 3 0 0 0 0 0 0 1 1 1
3350 M H H NA H H 0 1 4 0 0 0 0 0 1 2 0 0
3961 E E E E E M 5 1 0 4 1 0 0 0 0 0 0 0
4021 H H E M H E 2 1 3 0 1 0 0 0 1 1 2 0
Please note that:
. Users could report on 6 different tasks.
. E
, M
and H
are the counts of how many times the users reported a task to be Easy
, Mixed
or Hard, respectively
. The sum of these counts can (at most) be 6.
. EE
, EM
, EH
, MM
, ME
, MH
, HH
, HE
, HM
are the transitions of reported responses. Such as on one task a user reported E
and then on the other task they reported M
so it is the transition EM
. The other transitions are defined analogously.
Question:
I am interested in calculating different frequencies P(next)
, P(prev)
and P(next|prev)
for each of the reported states ie,
P(next_E)
, P(next_M)
, P(next_H)
P(prev_E)
, P(prev_M)
, P(prev_H)
where the probabilities are defined as follows:
P(next)
= Proportion of times a state occurred as a next state
P(prev)
= Proportion of times a state occurred as a previous state
P(next|prev)
= count(prev -> next) / count(prev)
I understand the question is a bit long; thank you for reading this far. I would appreciate any tips on this.
The following post is somewhat related: How to calculate the probability from frequencies
dput(df)
# dput() output of the question's data frame `df`: five rows, whose row names
# are the four-digit ids 3108..4021.
# NOTE(review): the first seven columns look shifted/garbled relative to the
# printed table -- `userID` is a factor with labels E/H/M and `postPOE_task`
# has labels "0".."3" plus "M". Confirm against the raw data before relying on
# these columns.
# NOTE(review): `HM` is NA for four of the five rows.
structure(list(userID = structure(c(2L, 2L, 3L, 1L, 2L), .Label = c("E","H","M"), class = "factor"),
pred_task = structure(c(1L, 2L, 2L, 1L, 2L), .Label = c(" E", " H"), class = "factor"),
obs1_task = structure(c(1L, 1L, 2L, 1L, 1L), .Label = c(" E", " H"), class = "factor"),
obs2_task = structure(c(1L, 3L, 3L, 1L, 2L), .Label = c(" E", " M", " NA"), class = "factor"),
exp1_task = structure(c(3L, 2L, 2L, 1L, 2L), .Label = c("E", "H", "M"), class = "factor"),
exp2_task = structure(c(4L, 4L, 3L, 1L, 2L), .Label = c("", "E", "H", "M"), class = "factor"),
postPOE_task = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("0", "1", "2", "3", "M"), class = "factor"),
E = c(2L, 1L, 1L, 5L, 1L),
M = c(1L, 3L, 4L, 1L, 3L),
H = c(2L, 0L, 0L, 0L, 0L),
EE = c(1L, 0L, 0L, 4L, 1L),
EM = c(0L, 0L, 0L, 1L, 0L),
EH = c(1L, 0L, 0L, 0L, 0L),
MM = c(0L, 0L, 0L, 0L, 0L),
ME = c(0L, 0L, 1L, 0L, 1L),
MH = c(0L, 1L, 2L, 0L, 1L),
HH = c(0L, 1L, 0L, 0L, 2L),
HE = c(0L, 1L, 0L, 0L, 0L),
HM = c(NA, NA, NA, 0L, NA)),
class = "data.frame", row.names = c("3108", "3207", "3350", "3961", "4021"))
Here's a function that will calculate the proportions you need, even with a much larger dataframe. Just be sure that the input dataframe has exactly the same structure as your example df
.
Also, I think I found some inconsistencies in your dput(df)
relative to your updated df
. I fixed the dput(df)
to reflect the values in your example df
.
# "Fixed" version of the question's dput(): the count columns E, M, H and the
# nine transition columns EE..HM are set to the values shown in the printed
# example table. The first seven (factor) columns are left exactly as they
# were in the question's dput.
# Columns 11-19 hold the transition counts, in the order
# EE, EM, EH, MM, ME, MH, HH, HE, HM.
# NOTE: this expression only builds the data frame; assign it
# (e.g. `df <- structure(...)`) before calling transition.probs(df).
structure(list(userID = structure(c(2L, 2L, 3L, 1L, 2L), .Label = c("E","H","M"), class = "factor"),
pred_task = structure(c(1L, 2L, 2L, 1L, 2L), .Label = c(" E", " H"), class = "factor"),
obs1_task = structure(c(1L, 1L, 2L, 1L, 1L), .Label = c(" E", " H"), class = "factor"),
obs2_task = structure(c(1L, 3L, 3L, 1L, 2L), .Label = c(" E", " M", " NA"), class = "factor"),
exp1_task = structure(c(3L, 2L, 2L, 1L, 2L), .Label = c("E", "H", "M"), class = "factor"),
exp2_task = structure(c(4L, 4L, 3L, 1L, 2L), .Label = c("", "E", "H", "M"), class = "factor"),
postPOE_task = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("0", "1", "2", "3", "M"), class = "factor"),
E = c(3L, 1L, 0L, 5L, 2L),
M = c(2L, 1L, 1L, 1L, 1L),
H = c(1L, 3L, 4L, 0L, 3L),
EE = c(2L, 0L, 0L, 4L, 0L),
EM = c(1L, 0L, 0L, 1L, 1L),
EH = c(0L, 0L, 0L, 0L, 0L),
MM = c(1L, 0L, 0L, 0L, 0L),
ME = c(0L, 0L, 0L, 0L, 0L),
MH = c(0L, 0L, 1L, 0L, 1L),
HH = c(0L, 1L, 2L, 0L, 1L),
HE = c(0L, 1L, 0L, 0L, 2L),
HM = c(0L, 1L, 0L, 0L, 0L)),
class = "data.frame", row.names = c("3108", "3207", "3350", "3961", "4021"))
The function:
#' Summarise bigram transition counts as marginal and conditional proportions
#'
#' Pools the nine transition-count columns over all rows of `df` and reports:
#' * `next.X`: share of all pooled transitions that end in state X,
#' * `prev.X`: share of all pooled transitions that start in state X,
#' * `XY.X`: count(X -> Y) / count(transitions starting in X).
#'
#' @param df A data frame (or matrix with column names) containing the
#'   numeric count columns `EE`, `EM`, `EH`, `MM`, `ME`, `MH`, `HH`, `HE`
#'   and `HM`. Each name is read as `<previous state><next state>`. Columns
#'   are looked up by name, so their position and any other columns are
#'   irrelevant. `NA` counts are ignored.
#' @return A data frame with 15 rows and two columns: `trans.state` (factor
#'   label) and `p` (numeric proportion). Rows are ordered next.E, prev.E,
#'   next.M, prev.M, next.H, prev.H, then EE.E, EM.E, EH.E, MM.M, ME.M, MH.M,
#'   HH.H, HE.H, HM.H. A proportion whose denominator is zero is `NaN`.
transition.probs <- function(df) {
  states <- c("E", "M", "H")
  # Order of the conditional rows in the result (self-transition first).
  pair_names <- c("EE", "EM", "EH", "MM", "ME", "MH", "HH", "HE", "HM")
  prev_state <- substr(pair_names, 1, 1)
  next_state <- substr(pair_names, 2, 2)

  missing_cols <- setdiff(pair_names, colnames(df))
  if (length(missing_cols) > 0) {
    stop(
      "`df` is missing transition count column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Pool every row's counts into one total per transition.
  pair_counts <- colSums(df[, pair_names, drop = FALSE], na.rm = TRUE)
  total_count <- sum(pair_counts)

  # Marginal totals: how often each state is the source / the target.
  prev_counts <- vapply(
    states,
    function(s) sum(pair_counts[prev_state == s]),
    numeric(1)
  )
  next_counts <- vapply(
    states,
    function(s) sum(pair_counts[next_state == s]),
    numeric(1)
  )

  # rbind() + as.vector() interleaves the two series: next.E, prev.E, next.M...
  marginal_labels <- as.vector(
    rbind(paste0("next.", states), paste0("prev.", states))
  )
  marginal_p <- as.vector(rbind(next_counts, prev_counts)) / total_count

  # P(next | prev): each transition count over its own source-state total.
  # The label suffix is the conditioning (previous) state.
  conditional_labels <- paste0(pair_names, ".", prev_state)
  conditional_p <- pair_counts / prev_counts[prev_state]

  data.frame(
    trans.state = as.factor(c(marginal_labels, conditional_labels)),
    p = as.numeric(c(marginal_p, conditional_p)),
    row.names = NULL
  )
}
The output dataframe:
transition.probs(df)
trans.state p
1 next.E 0.473684
2 prev.E 0.473684
3 next.M 0.263158
4 prev.M 0.157895
5 next.H 0.315789
6 prev.H 0.421053
7 EE.E 0.666667
8 EM.M 0.333333
9 EH.H 0.000000
10 MM.M 0.333333
11 ME.M 0.000000
12 MH.M 0.666667
13 HH.H 0.500000
14 HE.H 0.375000
15 HM.H 0.125000
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.