首页 > 解决方案 > dplyr:根据子字符串的多个区域/字符进行 mutate

问题描述

我是初学者。我正在尝试用 dplyr 的 mutate/case_when 对一个 10 个字符的字符串按多个子字符串条件进行分类。字符串中的每个位置代表一个种族。例如,第 1-3 位中任意一位为“Y”、其余位置均为“N”的字符串会被归为“Latino”。我想知道如何正确处理这样的字符串:第 1-3 位中任意一位为“Y”,同时第 4-5 位(“Asian”)中任意一位也为“Y”。我希望把这种情况归为“Multi-Ethnic”(多民族)。请问应该如何编写代码,才能为这类字符串得到“Multi-Ethnic”的结果?非常感谢这个网站!

library(dplyr)

# Sample input: one 10-character code per row. Each position is "Y" or "N";
# the classification code below reads positions 1-3, 4-5, 6, 7, 8-9 and 10.
data <- data.frame(
  APP_AC = c(
    "YNNNNNNNNN",
    "YYNNNNNNNN",
    "YYYNNNNNNN",
    "YNYNNNNNNN",
    "NNNYNNNNNN",
    "YNNYNNNNNN",
    "NNNNNYNNNN",
    "YNNNNYNNNY",
    "NNNNNNNNNN"
  )
)

# str_sub() is exported by stringr; attaching dplyr alone does not provide it.
library(stringr)

# First attempt: test each position in turn. case_when() returns the value of
# the FIRST condition that is TRUE for a row, so a code with "Y" in several
# positions is labelled by its left-most "Y" only.
data %>%
  mutate(ETHNICITY = case_when(
    str_sub(APP_AC, 1, 1) == "Y" ~ "Latino",
    str_sub(APP_AC, 2, 2) == "Y" ~ "Latino",
    str_sub(APP_AC, 3, 3) == "Y" ~ "Latino",
    str_sub(APP_AC, 4, 4) == "Y" ~ "Asian",
    str_sub(APP_AC, 5, 5) == "Y" ~ "Asian",
    str_sub(APP_AC, 6, 6) == "Y" ~ "Black",
    str_sub(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
    str_sub(APP_AC, 8, 8) == "Y" ~ "Pacific_Islander",
    str_sub(APP_AC, 9, 9) == "Y" ~ "Pacific_Islander",
    str_sub(APP_AC, 10, 10) == "Y" ~ "White",
    # No "Y" anywhere in the code.
    TRUE ~ "Unknown"
  ))

    APP_AC     ETHNICITY
1   YNNNNNNNNN Latino
2   YYNNNNNNNN Latino
3   YYYNNNNNNN Latino
4   YNYNNNNNNN Latino
5   NNNYNNNNNN Asian
6   YNNYNNNNNN Latino
7   NNNNNYNNNN Black
8   YNNNNYNNNY Latino
9   NNNNNNNNNN Unknown

期望的输出:

    APP_AC     ETHNICITY
1   YNNNNNNNNN Latino
2   YYNNNNNNNN Latino
3   YYYNNNNNNN Latino
4   YNYNNNNNNN Latino
5   NNNYNNNNNN Asian
6   YNNYNNNNNN Multi-Ethnic
7   NNNNNYNNNN Black
8   YNNNNYNNNY Multi-Ethnic
9   NNNNNNNNNN Unknown

标签: r dplyr substr case-when

解决方案


您可以使用 stringr 包中的字符串函数(例如 str_detect):

library(dplyr)
library(stringr)

# Classify each 10-character APP_AC code. Position groups:
#   1-3 Latino, 4-5 Asian, 6 Black, 7 Native_American_Alaskan,
#   8-9 Pacific_Islander, 10 White.
data %>%
  mutate(ETHNICITY = case_when(
    # Multi-ethnic means "Y" in more than one ethnic GROUP, not more than one
    # "Y" character: "YYNNNNNNNN" has two Ys but both are Latino positions.
    # Each str_detect() is TRUE/FALSE per row, so their sum counts the groups.
    (str_detect(str_sub(APP_AC, 1, 3), "Y") +
      str_detect(str_sub(APP_AC, 4, 5), "Y") +
      str_detect(str_sub(APP_AC, 6, 6), "Y") +
      str_detect(str_sub(APP_AC, 7, 7), "Y") +
      str_detect(str_sub(APP_AC, 8, 9), "Y") +
      str_detect(str_sub(APP_AC, 10, 10), "Y")) > 1 ~ "Multi-Ethnic",
    # Rows reaching this point have "Y" in at most one group, so the first
    # matching position identifies that group.
    str_sub(APP_AC, 1, 1) == "Y" ~ "Latino",
    str_sub(APP_AC, 2, 2) == "Y" ~ "Latino",
    str_sub(APP_AC, 3, 3) == "Y" ~ "Latino",
    str_sub(APP_AC, 4, 4) == "Y" ~ "Asian",
    str_sub(APP_AC, 5, 5) == "Y" ~ "Asian",
    str_sub(APP_AC, 6, 6) == "Y" ~ "Black",
    str_sub(APP_AC, 7, 7) == "Y" ~ "Native_American_Alaskan",
    str_sub(APP_AC, 8, 8) == "Y" ~ "Pacific_Islander",
    str_sub(APP_AC, 9, 9) == "Y" ~ "Pacific_Islander",
    str_sub(APP_AC, 10, 10) == "Y" ~ "White",
    # No "Y" anywhere in the code.
    TRUE ~ "Unknown"
  ))

#      APP_AC    ETHNICITY
#1 YNNNNNNNNN       Latino
#2 YYNNNNNNNN       Latino
#3 YYYNNNNNNN       Latino
#4 YNYNNNNNNN       Latino
#5 NNNYNNNNNN        Asian
#6 YNNYNNNNNN Multi-Ethnic
#7 NNNNNYNNNN        Black
#8 YNNNNYNNNY Multi-Ethnic
#9 NNNNNNNNNN      Unknown

同样,您也可以将其他条件合二为一,使代码更短。

# TRUE for each element of `code` that has a "Y" anywhere in positions
# `start`..`end` (a single position when `end` is omitted).
has_y <- function(code, start, end = start) {
  str_detect(str_sub(code, start, end), "Y")
}

# Same classification with one condition per ethnic group.
data %>%
  mutate(ETHNICITY = case_when(
    # Count the groups that contain a "Y" (logicals add up as 0/1). Counting
    # "Y" characters instead would mislabel codes such as "YYNNNNNNNN", whose
    # Ys all fall inside the Latino positions.
    (has_y(APP_AC, 1, 3) + has_y(APP_AC, 4, 5) + has_y(APP_AC, 6) +
      has_y(APP_AC, 7) + has_y(APP_AC, 8, 9) + has_y(APP_AC, 10)) > 1 ~
      "Multi-Ethnic",
    has_y(APP_AC, 1, 3) ~ "Latino",
    has_y(APP_AC, 4, 5) ~ "Asian",
    has_y(APP_AC, 6) ~ "Black",
    has_y(APP_AC, 7) ~ "Native_American_Alaskan",
    has_y(APP_AC, 8, 9) ~ "Pacific_Islander",
    has_y(APP_AC, 10) ~ "White",
    # No "Y" anywhere in the code.
    TRUE ~ "Unknown"
  ))

推荐阅读