繁体   English   中英

如何使用 R 根据长格式数据中列中第一个零的位置创建新的分类变量?

[英]How to create a new categorical variable based on the location of first zero in a column in a long format data using R?

数据集以长格式布局,没有缺失值,有 5 列,第 1 列是“id”,第 2 列是“gender”,第 3 列是二进制变量“competency”,第 4 个变量是薪水,第 5 列是时间。

现在,我想根据以下规则通过id创建一个新的分类变量“yeel”(年终评估级别),

1-如果每个id内的“competency”变量的值都等于1,那么“yeel”的值应该是“unqualified”;

2-如果变量“competency”中第一个零的位置落在“time”变量的[1,5]范围内,则“yeel”的值应该是“entry level”;

3-如果变量“competency”中第一个零的位置落在“time”变量的[6,8]范围内,则“yeel”的值应该是“intermediate level”;

4-如果变量“competency”中第一个零的位置落在“time”变量的[9,12]范围内,则“yeel”的值应该是“senior level”;

5-如果每个id中“competency”变量的所有值都等于0,那么“yeel”的值应该是“undefined”;

例如,预期的数据集应该如下所示

structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14), gender = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), competency = c(1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), salary = c(15000, 
15500, 15800, 16000, 16200, 16400, 16600, 16800, 17000, 17300, 
17500, 17500, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 
22000, 23000, 24000, 25000, 26000, 15000, 16000, 18000, 20000, 
22000, 24000, 26000, 28000, 30000, 32000, 34000, 36000, 15000, 
16000, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 
21000, 21500, 15000, 15300, 15600, 15900, 16200, 16500, 16800, 
17100, 17400, 17700, 18000, 18300, 15000, 15500, 15800, 16000, 
16200, 16400, 16600, 16800, 17000, 17300, 17500, 17500, 15000, 
16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 
25000, 26000, 15000, 16000, 18000, 20000, 22000, 24000, 26000, 
28000, 30000, 32000, 34000, 36000, 15000, 16000, 17000, 17500, 
18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 15000, 
15300, 15600, 15900, 16200, 16500, 16800, 17100, 17400, 17700, 
18000, 18300, 15000, 15500, 15800, 16000, 16200, 16400, 16600, 
16800, 17000, 17300, 17500, 17500, 15000, 16000, 17000, 18000, 
19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 15000, 
16000, 18000, 20000, 22000, 24000, 26000, 28000, 30000, 32000, 
34000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 
36000, 36000, 36000, 36000, 36000), time = c(1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12), yeel = c("unqualified", "unqualified", 
"unqualified", "unqualified", "unqualified", "unqualified", "unqualified", 
"unqualified", "unqualified", "unqualified", "unqualified", "unqualified", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "undefined", "undefined", "undefined", "undefined", 
"undefined", "undefined", "undefined", "undefined", "undefined", 
"undefined", "undefined", "undefined")), row.names = c(NA, 168L
), class = "data.frame")->df_new

原始数据集结构如下,

structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14), gender = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), competency = c(1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), salary = c(15000, 
15500, 15800, 16000, 16200, 16400, 16600, 16800, 17000, 17300, 
17500, 17500, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 
22000, 23000, 24000, 25000, 26000, 15000, 16000, 18000, 20000, 
22000, 24000, 26000, 28000, 30000, 32000, 34000, 36000, 15000, 
16000, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 
21000, 21500, 15000, 15300, 15600, 15900, 16200, 16500, 16800, 
17100, 17400, 17700, 18000, 18300, 15000, 15500, 15800, 16000, 
16200, 16400, 16600, 16800, 17000, 17300, 17500, 17500, 15000, 
16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 
25000, 26000, 15000, 16000, 18000, 20000, 22000, 24000, 26000, 
28000, 30000, 32000, 34000, 36000, 15000, 16000, 17000, 17500, 
18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 15000, 
15300, 15600, 15900, 16200, 16500, 16800, 17100, 17400, 17700, 
18000, 18300, 15000, 15500, 15800, 16000, 16200, 16400, 16600, 
16800, 17000, 17300, 17500, 17500, 15000, 16000, 17000, 18000, 
19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 15000, 
16000, 18000, 20000, 22000, 24000, 26000, 28000, 30000, 32000, 
34000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 
36000, 36000, 36000, 36000, 36000), time = c(1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12)), row.names = c(NA, 168L), class = "data.frame")->df

Thank you for your help!

您可以使用case_when为每个id传递多个条件。

library(dplyr)

df %>%
  group_by(id) %>%
  mutate(first_zero_loc = time[match(0, competency)], 
          yeel = case_when(all(competency == 1) ~ "unqualfied", 
                           all(competency == 0) ~ "undefined",
                           between(first_zero_loc, 1, 5) ~ "entry level", 
                           between(first_zero_loc, 6, 8) ~ "intermediate level",
                           between(first_zero_loc, 9, 12) ~ "competency level",
                          ))  %>%
  ungroup %>%
  select(-first_zero_loc)

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM