首页 > 解决方案 > 如何使用 R 根据长格式数据的列中第一个零的位置创建新的分类变量?

问题描述

数据集以长格式布局,没有缺失值,有 5 列,第 1 列是“id”,第 2 列是“gender”,第 3 列是二进制变量“competency”,第 4 个变量是薪水,第 5 列是时间。

现在,我想根据以下规则通过id创建一个新的分类变量“yeel”(年终评估级别“),

1-如果每个id内“competency”变量的所有值都等于1,那么“yeel”的值应该是“unqualfied”;

2-如果变量“competency”中第一个零的位置落入“time”变量的范围[1,5],那么“yeel”的值应该是“entry level”;

3-如果变量“competency”中第一个零的位置落入“time”变量的范围[6,8],那么“yeel”的值应该是“intermediate level”;

4-如果变量“competency”中第一个零的位置落入“time”变量的范围[9,12],那么“yeel”的值应该是“senior level”;

5-如果每个id内“competency”变量的所有值都等于0,那么“yeel”的值应该是“undefined”;

例如,预期的数据集应如下所示

structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14), gender = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), competency = c(1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), salary = c(15000, 
15500, 15800, 16000, 16200, 16400, 16600, 16800, 17000, 17300, 
17500, 17500, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 
22000, 23000, 24000, 25000, 26000, 15000, 16000, 18000, 20000, 
22000, 24000, 26000, 28000, 30000, 32000, 34000, 36000, 15000, 
16000, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 
21000, 21500, 15000, 15300, 15600, 15900, 16200, 16500, 16800, 
17100, 17400, 17700, 18000, 18300, 15000, 15500, 15800, 16000, 
16200, 16400, 16600, 16800, 17000, 17300, 17500, 17500, 15000, 
16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 
25000, 26000, 15000, 16000, 18000, 20000, 22000, 24000, 26000, 
28000, 30000, 32000, 34000, 36000, 15000, 16000, 17000, 17500, 
18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 15000, 
15300, 15600, 15900, 16200, 16500, 16800, 17100, 17400, 17700, 
18000, 18300, 15000, 15500, 15800, 16000, 16200, 16400, 16600, 
16800, 17000, 17300, 17500, 17500, 15000, 16000, 17000, 18000, 
19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 15000, 
16000, 18000, 20000, 22000, 24000, 26000, 28000, 30000, 32000, 
34000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 
36000, 36000, 36000, 36000, 36000), time = c(1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12), yeel = c("unqualified", "unqualified", 
"unqualified", "unqualified", "unqualified", "unqualified", "unqualified", 
"unqualified", "unqualified", "unqualified", "unqualified", "unqualified", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "entry level", "entry level", "entry level", "entry level", 
"entry level", "intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "intermediate level", "intermediate level", 
"intermediate level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "senior level", "senior level", "senior level", 
"senior level", "undefined", "undefined", "undefined", "undefined", 
"undefined", "undefined", "undefined", "undefined", "undefined", 
"undefined", "undefined", "undefined")), row.names = c(NA, 168L
), class = "data.frame")->df_new

原始数据集的结构如下,

structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14), gender = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), competency = c(1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), salary = c(15000, 
15500, 15800, 16000, 16200, 16400, 16600, 16800, 17000, 17300, 
17500, 17500, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 
22000, 23000, 24000, 25000, 26000, 15000, 16000, 18000, 20000, 
22000, 24000, 26000, 28000, 30000, 32000, 34000, 36000, 15000, 
16000, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 
21000, 21500, 15000, 15300, 15600, 15900, 16200, 16500, 16800, 
17100, 17400, 17700, 18000, 18300, 15000, 15500, 15800, 16000, 
16200, 16400, 16600, 16800, 17000, 17300, 17500, 17500, 15000, 
16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 
25000, 26000, 15000, 16000, 18000, 20000, 22000, 24000, 26000, 
28000, 30000, 32000, 34000, 36000, 15000, 16000, 17000, 17500, 
18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 15000, 
15300, 15600, 15900, 16200, 16500, 16800, 17100, 17400, 17700, 
18000, 18300, 15000, 15500, 15800, 16000, 16200, 16400, 16600, 
16800, 17000, 17300, 17500, 17500, 15000, 16000, 17000, 18000, 
19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 15000, 
16000, 18000, 20000, 22000, 24000, 26000, 28000, 30000, 32000, 
34000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 36000, 
36000, 36000, 36000, 36000, 36000), time = c(1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12)), row.names = c(NA, 168L), class = "data.frame")->df

Thank you for your help!

标签: r

解决方案


您可以利用case_when为每个 传递多个条件id

library(dplyr)

df %>%
  group_by(id) %>%
  mutate(first_zero_loc = time[match(0, competency)], 
          yeel = case_when(all(competency == 1) ~ "unqualfied", 
                           all(competency == 0) ~ "undefined",
                           between(first_zero_loc, 1, 5) ~ "entry level", 
                           between(first_zero_loc, 6, 8) ~ "intermediate level",
                           between(first_zero_loc, 9, 12) ~ "competency level",
                          ))  %>%
  ungroup %>%
  select(-first_zero_loc)

推荐阅读