首页 > 解决方案 > 如何仅将所有合并症字符串提取到新列中

问题描述

我想得到一个新列,其中只包含合并症字符串以及 "None"(无)类别。我使用的是 R,最好能用 tidyverse 来实现。您会看到,其中 2 行含有我不感兴趣的奇怪字符串。下面是我的数据示例。

   # Sample input (dput output): a 20-row tibble with columns id,
   # health_care_worker, how_unwell, health_cnd and health_1.
   # In health_1, rows 4, 9 and 18 hold a long comma-separated string in which
   # the comorbidity names are mixed with unrelated survey fields; every other
   # row is "None".
   structure(list(id = c("1", "2", "3", "4", "5", "6", "7", "9", 
"8", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20"), health_care_worker = c("No", "No", "No", "No", "Yes", 
"No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", 
"No", "No", "No", "No", "No"), how_unwell = c(1, 6, 1, 1, 1, 
6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), health_cnd = c("None", 
NA, NA, "Diabetes Type 2,No,Yes,Yes,No,4,No,Showing Symptoms But Not Tested,Mild,Yes,No,No,No,Spanish,No, No,No,Yes,No 3bad24c8-0ac9-4269-aa53-5e8d41b03142,35,Female,Rio de Janeiro", 
NA, NA, NA, NA, "High Blood Pressure (hypertension),No,Yes,No,No,0,No,Self-Isolating With No Symptoms,None,No,No,No,No,Portuguese,No, No,No,No,Yes 2656b3f2-d916-43e1-96b2-1d371d8c7b12,58,Female,Belém/ Pará", 
NA, NA, NA, NA, NA, NA, NA, NA, "High Blood Pressure (hypertension),No,Yes,No,No,15,No,Showing Symptoms But Not Tested,Moderate,Yes,No,No,No,Spanish,No, No,No,Yes,No 41cf840a-cfcc-441f-a995-f6b75ecee967,22,Male,Agb,India,2020-08-04 05:25:00,No,No,No,No,No,No,No,1,None,N", 
NA, NA), health_1 = c("None", "None", "None", "Diabetes Type 2,No, Asthma, Yes,Yes,No,4,No,Showing Symptoms But Not Tested,Mild,Yes,No,No,No,Spanish,No, No,No,Yes,No 3bad24c8-0ac9-4269-aa53-5e8d41b03142,35,Female,Rio de Janeiro", 
"None", "None", "None", "None", "High Blood Pressure (hypertension),No, Obesity, Yes,No,No,0,No,Self-Isolating With No Symptoms,None,No,No,No,No,Portuguese,No, No,No,No,Yes 2656b3f2-d916-43e1-96b2-1d371d8c7b12,58,Female,Belém/ Pará", 
"None", "None", "None", "None", "None", "None", "None", "None", 
"High Blood Pressure (hypertension),No,Lung-condition, Yes,No,No,15,No,Showing Symptoms But Not Tested,Moderate,Yes,No,No,No,Spanish,No, No,No,Yes,No 41cf840a-cfcc-441f-a995-f6b75ecee967,22,Male,Agb,India,2020-08-04 05:25:00,No,No,No,No,No,No,No,1,None,N", 
"None", "None")), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

这就是我希望新列呈现的样子。

# Desired output (dput output): a 20-row tibble with columns id,
# health_care_worker, how_unwell and health_1, plus a new column
# copy_health_column that keeps only the comorbidity names found in health_1
# and is "None" for rows without any.
structure(list(id = c("1", "2", "3", "4", "5", "6", "7", "9", 
"8", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20"), health_care_worker = c("No", "No", "No", "No", "Yes", 
"No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", 
"No", "No", "No", "No", "No"), how_unwell = c(1, 6, 1, 1, 1, 
6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), health_1 = c("None", 
"None", "None", "Diabetes Type 2,No, Asthma, Yes,Yes,No,4,No,Showing Symptoms But Not Tested,Mild,Yes,No,No,No,Spanish,No, No,No,Yes,No 3bad24c8-0ac9-4269-aa53-5e8d41b03142,35,Female,Rio de Janeiro", 
"None", "None", "None", "None", "High Blood Pressure (hypertension),No, Obesity, Yes,No,No,0,No,Self-Isolating With No Symptoms,None,No,No,No,No,Portuguese,No, No,No,No,Yes 2656b3f2-d916-43e1-96b2-1d371d8c7b12,58,Female,Belém/ Pará", 
"None", "None", "None", "None", "None", "None", "None", "None", 
"High Blood Pressure (hypertension),No,Lung-condition, Yes,No,No,15,No,Showing Symptoms But Not Tested,Moderate,Yes,No,No,No,Spanish,No, No,No,Yes,No 41cf840a-cfcc-441f-a995-f6b75ecee967,22,Male,Agb,India,2020-08-04 05:25:00,No,No,No,No,No,No,No,1,None,N", 
"None", "None"), copy_health_column = c("None", "None", "None", 
"Diabetes Type 2, Asthma", "None", "None", "None", "None", "High Blood Pressure (hypertension), Obesity", 
"None", "None", "None", "None", "None", "None", "None", "None", 
"High Blood Pressure (hypertension),Lung-condition", "None", 
"None")), row.names = c(NA, -20L), class = c("tbl_df", "tbl", 
"data.frame"))

现在,我的原始数据有超过 10 万个数据点。因此,我希望我能得到一个适用于更大数据集的解决方案。

标签: r string tidyverse

解决方案


library(tidyverse)

# Comorbidity labels to keep, joined into one regex alternation.
# Parentheses are escaped so they match literally. "Lung-condition" is
# included because the desired output expects it for row 18.
comorbidity_pattern <- str_c(
  "Diabetes Type 2",
  "Asthma",
  "Obesity",
  "Lung-condition",
  "High Blood Pressure \\(hypertension\\)",
  sep = "|"
)

# Collapse the matches found in one cell into a single string.
#   hits: character vector of matches for one row (one element of the list
#         returned by str_extract_all()).
# Returns NA for a missing input cell, "None" when nothing matched (so rows
# without comorbidities keep the "None" label instead of becoming ""), and
# otherwise the matches joined by "," in their order of appearance.
collapse_comorbidities <- function(hits) {
  if (anyNA(hits)) {
    return(NA_character_)
  }
  if (length(hits) == 0L) {
    return("None")
  }
  str_c(hits, collapse = ",")
}

# str_extract_all() is already vectorised over the column, so rowwise() is
# not needed; dropping it avoids per-row overhead on large data and returns
# a plain (non-rowwise) tibble.
df %>%
  mutate(
    copy_health_column = map_chr(
      str_extract_all(health_1, comorbidity_pattern),
      collapse_comorbidities
    )
  )

推荐阅读