![](/img/trans.png)
[英]R,dplyr: How to replace 0 values based conditional on size of group_by
[英]Conditional increment of an integer based on a condition inside a R dplyr group_by
我有一个数据集,其中活动工作流程每天向满足特定条件的联系人发送电子邮件。
工作流会发送 A、B、C 三种不同的 email 通信,各次发送之间间隔若干天,并根据收件人的交互事件触发。 A 和 B 两种 email 通信各有两个版本:A1、A2 和 B1、B2。
收件人将来有资格再次进入活动工作流程。 该数据集没有元数据来指示每个 recipient_id 的新活动从何时开始。 因此,为了进一步分析数据,我需要新增两列,用于在每个 recipient_id 组内识别新的活动,其逻辑如下所述:
方法 1 :如果 step > lead(step),则标志着活动结束;或者,如果 step < lag(step),则标志着新活动的开始,此时将活动计数增加 1。
方法 2 :如果 step > lead(step) & lead(date) - date > 14 天,则标志着活动结束;或者,如果 step < lag(step) & date - lag(date) > 14 天,则标志着新活动的开始,此时将活动计数增加 1。
这是输入数据集:
structure(list(campaign = c("campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x"), com_elm = c("campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_B1", "campaign_x_C3",
"campaign_x_B1", "campaign_x_A1", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B1", "campaign_x_C3", "campaign_x_B1", "campaign_x_A1",
"campaign_x_C3", "campaign_x_C3", "campaign_x_B1", "campaign_x_B2",
"campaign_x_C3", "campaign_x_B1", "campaign_x_C3", "campaign_x_C3"
), com_elm_id = c(808001L, 811001L, 814001L, 509005L, 729060L,
817002L, 820002L, 792002L, 793003L, 820003L, 824003L, 792002L,
811001L, 787001L, 811001L, 468023L, 792002L, 812001L, 812001L,
808001L, 811001L, 468023L, 468006L, 491014L, 825002L, 828002L,
741001L, 825002L, 512001L, 733001L), recipient_id = c(54L, 54L,
54L, 197L, 197L, 8388L, 8388L, 8426L, 8426L, 10903L, 10903L,
14469L, 14469L, 17466L, 17466L, 17807L, 21666L, 23935L, 24287L,
25412L, 25412L, 31361L, 31361L, 31361L, 31365L, 31365L, 40849L,
40860L, 41737L, 41737L), step = c(3, 1, 2, 3, 3, 1, 2, 3, 3,
1, 2, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 3
), date = structure(c(19029, 19032, 19035, 18778, 18960, 19037,
19040, 19016, 19019, 19040, 19043, 19015, 19032, 19011, 19032,
18746, 19015, 19033, 19033, 19029, 19032, 18746, 18746, 18764,
19044, 19047, 18969, 19044, 18781, 18962), class = "Date")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -30L), groups = structure(list(
campaign = c("campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x"), recipient_id = c(54L,
197L, 8388L, 8426L, 10903L, 14469L, 17466L, 17807L, 21666L,
23935L, 24287L, 25412L, 31361L, 31365L, 40849L, 40860L, 41737L
), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10:11, 12:13,
14:15, 16L, 17L, 18L, 19L, 20:21, 22:24, 25:26, 27L,
28L, 29:30), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -17L), .drop = TRUE))
这是预期的 output dataframe:
structure(list(campaign = c("campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x"), com_elm = c("campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_B1", "campaign_x_C3",
"campaign_x_B1", "campaign_x_A1", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B1", "campaign_x_C3", "campaign_x_B1", "campaign_x_A1",
"campaign_x_C3", "campaign_x_C3", "campaign_x_B1", "campaign_x_B2",
"campaign_x_C3", "campaign_x_B1", "campaign_x_C3", "campaign_x_C3"
), com_elm_id = c(808001L, 811001L, 814001L, 509005L, 729060L,
817002L, 820002L, 792002L, 793003L, 820003L, 824003L, 792002L,
811001L, 787001L, 811001L, 468023L, 792002L, 812001L, 812001L,
808001L, 811001L, 468023L, 468006L, 491014L, 825002L, 828002L,
741001L, 825002L, 512001L, 733001L), recipient_id = c(54L, 54L,
54L, 197L, 197L, 8388L, 8388L, 8426L, 8426L, 10903L, 10903L,
14469L, 14469L, 17466L, 17466L, 17807L, 21666L, 23935L, 24287L,
25412L, 25412L, 31361L, 31361L, 31361L, 31365L, 31365L, 40849L,
40860L, 41737L, 41737L), step = c(3, 1, 2, 3, 3, 1, 2, 3, 3,
1, 2, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 3
), date = structure(c(19029, 19032, 19035, 18778, 18960, 19037,
19040, 19016, 19019, 19040, 19043, 19015, 19032, 19011, 19032,
18746, 19015, 19033, 19033, 19029, 19032, 18746, 18746, 18764,
19044, 19047, 18969, 19044, 18781, 18962), class = "Date"), campaign_num_v1 = c(1,
2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1,
1, 2, 1, 1, 1, 1, 1, 2), campaign_num_v2 = c(1, 1, 1, 1, 2, 1,
1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
1, 1, 2)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -30L), groups = structure(list(campaign = c("campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x"), recipient_id = c(54L, 197L, 8388L, 8426L, 10903L,
14469L, 17466L, 17807L, 21666L, 23935L, 24287L, 25412L, 31361L,
31365L, 40849L, 40860L, 41737L), .rows = structure(list(1:3,
4:5, 6:7, 8:9, 10:11, 12:13, 14:15, 16L, 17L, 18L, 19L, 20:21,
22:24, 25:26, 27L, 28L, 29:30), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -17L), .drop = TRUE))
下图显示了 output dataframe 的前 10 个观察值,即按上述方法 1 和方法 2 计算出的新列 campaign_num_v1 和 campaign_num_v2 添加到原始 dataframe 之后的样子:
我尝试了不同问题中提出的解决方案的多种变体,但无法使其正常工作。 这是我尝试过的最简单的编码版本之一,但不起作用:
# Attempted solution (reported in the surrounding text as not working): tries
# to number each recipient's campaigns by accumulating "step drops" per group.
# NOTE(review): the first group_by() is removed again by the ungroup() below,
# and arrange() ignores grouping unless .by_group = TRUE, so only the
# (campaign, recipient_id) grouping affects the mutate() calls.
dat %>% group_by(campaign, com_elm, recipient_id) %>%
arrange(recipient_id, date) %>%
ungroup() %>%
group_by(campaign, recipient_id ) %>%
# `switch` is TRUE where a row's step is greater than the next row's step; the
# NA produced by lead() on the last row of each group is mapped to FALSE.
mutate(switch = ifelse(is.na(step > lead(step)), FALSE,step > lead(step))) %>%
# campaign_num_v1 accumulates the raw comparison, so the NA from lead() on the
# last row of each group propagates into cumsum(). campaign_num_v2 accumulates
# the NA-free `switch`. Neither column looks at `date`, and both counters start
# at 0 (not 1) whenever the first row of a group is not followed by a drop.
mutate(campaign_num_v1 = cumsum(step>lead(step)),
campaign_num_v2 = cumsum(switch))
使用 dplyr 的解决方案是理想的,但如果别无选择,我不介意采用其他方法。 谢谢。
您可以像下面这样将 lag() 与 cumsum() 一起使用,并使用 coalesce() 处理每组开头由 lag() 产生的 NA(这要归功于 Martin 的有益评论)。
# Number each recipient's campaigns in date order. A row opens a new campaign
# when it is the first row of its (campaign, recipient_id) group -- lag() is NA
# there, which coalesce() turns into TRUE -- or when its step is not greater
# than the previous row's step. The second counter additionally requires a gap
# of more than 14 days since the previous row.
df %>%
  arrange(campaign, recipient_id, date) %>%
  group_by(campaign, recipient_id) %>%
  mutate(
    campaign_num1 = cumsum(coalesce(step <= lag(step), TRUE)),
    campaign_num2 = cumsum(
      coalesce(step <= lag(step) & (date - lag(date)) > 14, TRUE)
    )
  )
Output:(前十行)
# A tibble: 30 x 8
# Groups: campaign, recipient_id [17]
campaign com_elm com_elm_id recipient_id step date campaign_num1 campaign_num2
<chr> <chr> <int> <int> <dbl> <date> <int> <int>
1 campaign_x campaign_x_C3 808001 54 3 2022-02-06 1 1
2 campaign_x campaign_x_B1 811001 54 1 2022-02-09 2 1
3 campaign_x campaign_x_B2 814001 54 2 2022-02-12 2 1
4 campaign_x campaign_x_C3 509005 197 3 2021-05-31 1 1
5 campaign_x campaign_x_C3 729060 197 3 2021-11-29 2 2
6 campaign_x campaign_x_B1 817002 8388 1 2022-02-14 1 1
7 campaign_x campaign_x_B2 820002 8388 2 2022-02-17 1 1
8 campaign_x campaign_x_C3 792002 8426 3 2022-01-24 1 1
9 campaign_x campaign_x_C3 793003 8426 3 2022-01-27 2 1
10 campaign_x campaign_x_B1 820003 10903 1 2022-02-17 1 1
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.