r - 为什么我没有在 R 中根据 id 和另一列获得正确的计数?
问题描述
我正在尝试使用 R 中的 tidyverse 库,根据 id 和合并症(包含不同类型的合并症)这两列来获得正确的合并症计数。我想弄清楚自己哪里做错了,因为我用的是看起来最显而易见的做法,见下文:
这是数据的结构:
structure(list(id = c("133", "cd5", "392", "ffa", "6ed", "9a2",
"989", "870", "2d9", "f9e", "d36", "8f4", "fb8", "626", "8fb",
"aea", "aea", "af4", "162", "162"), Comorbidity_count = c("Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_two",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_two"), Comorbidity = c("None",
"None", "None", "High Blood Pressure (hypertension)", "None",
"None", "None", "Asthma (managed with an inhaler)", "None", "None",
"None", "None", "None", "None", "None", "Diabetes Type 2", "Obesity",
"None", "High Blood Pressure (hypertension)", "Obesity")), row.names = c(NA,
-20L), groups = structure(list(id = c("133", "cd5", "392", "ffa",
"6ed", "9a2", "989", "870", "2d9", "f9e", "d36", "8f4", "fb8",
"626", "8fb", "aea", "aea", "af4", "162", "162"), .rows = structure(list(
7L, 6L, 16:17, 19:20, 11L, 3L, 4L, 5L, 8L, 2L, 14L, 9L, 15L,
10L, 12L, 13L, 18L, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 18L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
如果我写下面的代码,我没有得到正确的计数:
# Attempt from the question: add a per-row count of comorbidities.
# NOTE: grouping by both id and Comorbidity means n() returns the number of
# rows in each (id, Comorbidity) pair, not the number of comorbidities per id.
count_id <- test %>%
# Recode the "None" placeholder in Comorbidity as NA.
naniar::replace_with_na(replace = list(Comorbidity = "None")) %>%
dplyr::group_by(id, Comorbidity) %>%
# n() counts every row of the current group, including rows whose
# Comorbidity is NA.
dplyr::mutate(number_morbidities = n())
结果应该如下表所示:
structure(list(id = c("133", "cd5", "392", "ffa", "6ed", "9a2",
"989", "870", "2d9", "f9e", "d36", "8f4", "fb8", "626", "8fb",
"aea", "aea", "af4", "162", "162"), Comorbidity_count = c("Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_two",
"Comorbidity_one", "Comorbidity_one", "Comorbidity_two"), Comorbidity = c(NA,
NA, NA, "High Blood Pressure (hypertension)", NA, NA, NA, "Asthma (managed with an inhaler)",
NA, NA, NA, NA, NA, NA, NA, "Diabetes Type 2", "Obesity", NA,
"High Blood Pressure (hypertension)", "Obesity"), number_morbidities = c(NA,
NA, NA, 1L, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, 2L, 2L,
NA, 2L, 2L)), row.names = c(NA, -20L), groups = structure(list(
id = c("133", "162", "2d9", "392", "626", "6ed", "870", "8f4",
"8fb", "989", "9a2", "aea", "af4", "cd5", "d36", "f9e", "fb8",
"ffa"), .rows = structure(list(1L, 19:20, 9L, 3L, 14L, 5L,
8L, 12L, 15L, 7L, 6L, 16:17, 18L, 2L, 11L, 10L, 13L,
4L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), row.names = c(NA, 18L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
解决方案
您只需要按 `id` 分组,因为您想要的是每个 id 的计数;如果您想忽略没有合并症的 id,请使用不同的方法来计算合并症。`n()` 会计算所有行,无论其值是否缺失。请注意,如果没有合并症,这种方法会产生 0,我认为这比 `NA` 更有意义;如果需要,您可以将 0 替换为 `NA`。请注意,我也跳过了 `naniar` 依赖项,但这并不会改变结果。
library(tidyverse)
test <- structure(list(id = c("133", "cd5", "392", "ffa", "6ed", "9a2", "989", "870", "2d9", "f9e", "d36", "8f4", "fb8", "626", "8fb", "aea", "aea", "af4", "162", "162"), Comorbidity_count = c("Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_one", "Comorbidity_two", "Comorbidity_one", "Comorbidity_one", "Comorbidity_two"), Comorbidity = c("None", "None", "None", "High Blood Pressure (hypertension)", "None", "None", "None", "Asthma (managed with an inhaler)", "None", "None", "None", "None", "None", "None", "None", "Diabetes Type 2", "Obesity", "None", "High Blood Pressure (hypertension)", "Obesity")), row.names = c(NA, -20L), groups = structure(list(id = c("133", "cd5", "392", "ffa", "6ed", "9a2", "989", "870", "2d9", "f9e", "d36", "8f4", "fb8", "626", "8fb", "aea", "aea", "af4", "162", "162"), .rows = structure(list(7L, 6L, 16:17, 19:20, 11L, 3L, 4L, 5L, 8L, 2L, 14L, 9L, 15L, 10L, 12L, 13L, 18L, 1L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), row.names = c(NA, 18L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", "tbl", "data.frame"))
# Count comorbidities per id; ids without any comorbidity get 0.
test %>%
  # Recode the "None" placeholder as a missing value so it is not counted.
  mutate(Comorbidity = na_if(Comorbidity, "None")) %>%
  # Group by id only: the count is wanted per id, not per (id, Comorbidity).
  group_by(id) %>%
  # Number of non-missing comorbidities among each id's rows.
  mutate(number_morbidities = sum(!is.na(Comorbidity)))
#> # A tibble: 20 x 4
#> # Groups: id [18]
#> id Comorbidity_count Comorbidity number_morbidities
#> <chr> <chr> <chr> <int>
#> 1 133 Comorbidity_one <NA> 0
#> 2 cd5 Comorbidity_one <NA> 0
#> 3 392 Comorbidity_one <NA> 0
#> 4 ffa Comorbidity_one High Blood Pressure (hypertension) 1
#> 5 6ed Comorbidity_one <NA> 0
#> 6 9a2 Comorbidity_one <NA> 0
#> 7 989 Comorbidity_one <NA> 0
#> 8 870 Comorbidity_one Asthma (managed with an inhaler) 1
#> 9 2d9 Comorbidity_one <NA> 0
#> 10 f9e Comorbidity_one <NA> 0
#> 11 d36 Comorbidity_one <NA> 0
#> 12 8f4 Comorbidity_one <NA> 0
#> 13 fb8 Comorbidity_one <NA> 0
#> 14 626 Comorbidity_one <NA> 0
#> 15 8fb Comorbidity_one <NA> 0
#> 16 aea Comorbidity_one Diabetes Type 2 2
#> 17 aea Comorbidity_two Obesity 2
#> 18 af4 Comorbidity_one <NA> 0
#> 19 162 Comorbidity_one High Blood Pressure (hypertension) 2
#> 20 162 Comorbidity_two Obesity 2
由reprex 包(v0.3.0)于 2020 年 8 月 26 日创建