r - 合并包含空 NA 值的多个列
问题描述
我有一个数据集,其中同一份数据(boxID)被拆分到了多个列中。我想把这些列合并起来,使结果中只保留 boxID(一个字母数字代码:两个字母的州名缩写加 2 位数字),而不包含 NA 值;但我现在使用 tidyr 的 unite() 函数时,得到的结果里仍然带有 NA。是否有类似的函数可以做到这一点,还是我需要用 stringr 的模式匹配来提取 boxID?
# Example data: 48 rows by 13 columns, assembled as a tibble-classed list.
# Every row holds its box ID in exactly one column; all other cells are NA.
dat <- structure(
  list(
    boxId = c(
      "CA04", "CA04", "CA01", "CA02", "CA04", "CA02",
      rep(NA, 42)
    ),
    boxId__1 = c(
      rep(NA, 28),
      "NM01", "NM14", "NM15", "NM16", "NM17", "NM18", "NM19", "NM20",
      "NM02", "NM03", "NM04", "NM05", "NM06", "NM07", "NM08", "NM09",
      "NM10", "NM11", "NM12", "NM13"
    ),
    boxId__2 = c(
      rep(NA, 6),
      "FL01", "FL02", "FL03", "FL09", "FL08", "FL07", "FL04", "FL05",
      "FL06", "FL10", "FL11", "FL13", "FL12", "FL20", "FL19", "FL18",
      "FL17", "FL16", "FL14", "FL15",
      rep(NA, 22)
    ),
    # Columns with no values at all remain logical NA vectors.
    boxID = rep(NA, 48),
    boxID__1 = rep(NA, 48),
    boxID__2 = rep(NA, 48),
    boxID__3 = c(rep(NA, 26), "IN05", rep(NA, 21)),
    boxID__4 = rep(NA, 48),
    boxID__5 = rep(NA, 48),
    boxID__6 = rep(NA, 48),
    boxID__7 = rep(NA, 48),
    boxID__8 = rep(NA, 48),
    boxID__9 = c(rep(NA, 27), "WA11", rep(NA, 20))
  ),
  row.names = c(NA, -48L),
  class = c("tbl_df", "tbl", "data.frame")
)
数据如下所示:
# A tibble: 48 x 13
boxId boxId__1 boxId__2 boxID boxID__1 boxID__2 boxID__3 boxID__4 boxID__5 boxID__6
<chr> <chr> <chr> <lgl> <lgl> <lgl> <chr> <lgl> <lgl> <lgl>
1 CA04 NA NA NA NA NA NA NA NA NA
2 CA04 NA NA NA NA NA NA NA NA NA
3 CA01 NA NA NA NA NA NA NA NA NA
4 CA02 NA NA NA NA NA NA NA NA NA
5 CA04 NA NA NA NA NA NA NA NA NA
6 CA02 NA NA NA NA NA NA NA NA NA
7 NA NA FL01 NA NA NA NA NA NA NA
8 NA NA FL02 NA NA NA NA NA NA NA
9 NA NA FL03 NA NA NA NA NA NA NA
10 NA NA FL09 NA NA NA NA NA NA NA
# … with 38 more rows, and 3 more variables: boxID__7 <lgl>, boxID__8 <lgl>, boxID__9 <chr>
当我使用 unite() 时,它看起来像这样:
# Paste every column of `dat` into a single "newID" column. With unite()'s
# defaults the pieces are joined by "_" and NA cells appear as the text "NA".
unite(dat, 'newID')
结果里仍然带着这些 NA 值,无法去掉:
# A tibble: 48 x 1
newID
<chr>
1 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
2 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
3 CA01_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
4 CA02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
5 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
6 CA02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
7 NA_NA_FL01_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
8 NA_NA_FL02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
解决方案
一种 base R 的做法是:先用 unlist() 展开数据框中的所有值,
然后只保留其中的非 NA 值,创建一个只有一列的新数据框。
# Flatten all columns into one named vector. unlist() walks the data frame
# column by column, so values come out in column order, not row order.
x <- unlist(dat)
# Keep the non-missing entries only; their names become the row names.
data.frame(new_id = x[which(!is.na(x))])
# new_id
#boxId1 CA04
#boxId2 CA04
#boxId3 CA01
#boxId4 CA02
#boxId5 CA04
#boxId6 CA02
#boxId__129 NM01
#boxId__130 NM14
#boxId__131 NM15
#......
推荐阅读
- node.js - Mongodb 日期字段未在本地时区设置日期
- html - CSS下拉列表被包含并在div内切断
- vb.net - 加载程序集时出现 FileNotFoundException
- google-cloud-platform - 即使从 Google Cloud Scheduler HTTP 请求正确调用 Google Cloud Function 也不会执行
- solr - Solr:子文档的排名
- macos - 当我键入 exit 时,有什么方法可以退出 macOS 上的终端应用程序(如 cmd+q)?
- python - 如何在 cvxpy 中动态地对数组的子集施加约束?
- javascript - Javascript:在从 API 调用加载数据之前渲染 HTML
- reactjs - 上传到 gh-pages 时,使用 react-router 的 React 应用程序不会呈现主页组件
- dovecot - dovecot 创建虚拟文件夹(imap 搜索)