首页 > 解决方案 > R:用函数或类似方法,统计大型数据集中列名包含特定字符的各列的非 NA 值数量

问题描述

我有一个大数据集(907 x 1855)。我需要计算每位患者接受了多少次随访。随访列的取值为 1、2 或 NA,只要某个随访列满足 !is.na(),就可以视为进行了一次随访。

最多可进行 20 次随访。如您所见,每次随访的列名都添加了后缀 _vX,其中 X 对应随访的序号。

因此,第 20 次随访(follow-up nr 20)的列名是由 RedCap 自动生成的,非常不方便:p$fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18_v19_v20

> head(p)
  fu_location fu_location_v2 fu_location_v2_v3 fu_location_v2_v3_v4    ...
1           1              1                 1                    1    ...
2           2              2                 1                    2    ...
3           1              1                 1                    2    ...
4           2              2                 2                    2    ...

我需要统计列名包含 "fu_location" 的各列中满足 !is.na() 的值的数量。我试过 mutate(n_fu = sum(!is.na(contains("fu_location")))),但没有奏效。

最好是基于 dplyr 的解决方案。也许可以写成一个函数?

预期输出:

> head(p)
  fu_location fu_location_v2 fu_location_v2_v3 fu_location_v2_v3_v4    n_fu
1           1              1                 1                    1       8
2           2              2                 1                    2      20
3           1              1                 1                    2       4
4           2              2                 2                    2       4

  

数据

# Example data: 4 patients (rows) x 20 follow-up columns. Each column holds
# 1, 2 or NA; a non-NA entry means that follow-up took place. Column names
# mirror the RedCap export, where every follow-up appends another `_vX` suffix.
p <- data.frame(
  fu_location = c(1L, 2L, 1L, 2L),
  fu_location_v2 = c(1L, 2L, 1L, 2L),
  fu_location_v2_v3 = c(1L, 1L, 1L, 2L),
  fu_location_v2_v3_v4 = c(1L, 2L, 2L, 2L),
  fu_location_v2_v3_v4_v5 = c(2L, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6 = c(1L, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7 = c(2L, 1L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8 = c(1L, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9 = c(NA, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10 = c(NA, 1L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11 = c(NA, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12 = c(NA, 1L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13 = c(NA, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14 = c(NA, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15 = c(NA, 1L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16 = c(NA, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17 = c(NA, 1L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18 = c(NA, 2L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18_v19 = c(NA, 1L, NA, NA),
  fu_location_v2_v3_v4_v5_v6_v7_v8_v9_v10_v11_v12_v13_v14_v15_v16_v17_v18_v19_v20 = c(NA, 2L, NA, NA)
)

标签: r, function, dataframe, dplyr

解决方案


使用rowSums

library(dplyr)
# Per-row count of non-missing entries across every column whose name
# contains "fu_location"; the count is appended as column `n_fu`.
mutate(p, n_fu = rowSums(!is.na(select(p, contains("fu_location")))))

或者使用基础 R(base R):

# Base R: keep only the columns whose name contains "fu_location" (literal
# match), then count the non-NA cells in each row and store the count in `n_fu`.
p[["n_fu"]] <- rowSums(!is.na(p[grepl("fu_location", names(p), fixed = TRUE)]))

推荐阅读