r - 使用 left_join 提取数据时的困难
问题描述
请在下面找到我的数据。我遇到了两个问题。
我正在尝试将yy$n_otte
值合并到缺失h$n_otte
值中。我的方法是dplyr::left_join
匹配 by study
,os.neck
和between n_sygdom
and 。我需要将所有这些变量作为两者进行匹配,并包含两个大型电子表格。age
h
yy
h
yy
> head(h)
study os.neck age n_sygdom n_otte
1 B 49.00 53 0 N0
2 B 1.00 83 0 N0
3 A 76.44 63 2 <NA>
4 B 11.00 45 0 N0
5 A 9.21 37 15 <NA>
6 B 1.00 60 1 N1
和
> head(yy)
study os.neck n_sygdom age n_otte
1 A 42.12 0 63 N0
2 A 30.72 0 61 N0
3 A 136.20 0 48 N0
4 A 23.40 0 63 N0
5 A 5.16 3 67 N3b
6 A 33.96 0 58 N0
问题 1:为什么 as_integer() 会改变我的值?
> str(yy)
'data.frame': 643 obs. of 5 variables:
$ study : Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
$ os.neck : num 42.12 30.72 136.2 23.4 5.16 ...
$ n_sygdom: Factor w/ 22 levels "0","1","10","11",..: 1 1 1 1 13 1 11 11 2 1 ...
$ age : num 63 61 48 63 67 58 23 52 53 62 ...
$ n_otte : Factor w/ 6 levels "N0","N1","N2a",..: 1 1 1 1 6 1 6 4 3 1 ...
我在尝试着
yy <- yy %>% mutate(n_sygdom = as.integer(n_sygdom))
但是yy$n_sygdom
变化。
> head(yy)
study os.neck n_sygdom age n_otte
1 A 42.12 1 63 N0
2 A 30.72 1 61 N0
3 A 136.20 1 48 N0
4 A 23.40 1 63 N0
5 A 5.16 13 67 N3b
6 A 33.96 1 58 N0
问题
为什么会yy$n_sygdom
改变?我想包含yy$n_sygdom
为整数,但显然保留初始整数。
问题 2:left_join 匹配没有产生预期的输出
显然,首先需要解决问题 1,因为
a <- left_join(h, yy, by=c("study", "os.neck", "age", "n_sygdom"))
产量
由于类型不兼容(因子/整数),无法加入 'n_sygdom' x 'n_sygdom'
但是,我遇到的问题也在这里(没有n_sygdom
):
a <- left_join(h, yy, by=c("study", "os.neck", "age"))
> head(a)
study os.neck age n_sygdom.x n_otte.x n_sygdom.y n_otte.y
1 B 49.00 53 0 N0 <NA> <NA>
2 B 1.00 83 0 N0 <NA> <NA>
3 A 76.44 63 2 <NA> <NA> <NA>
4 B 11.00 45 0 N0 <NA> <NA>
5 A 9.21 37 15 <NA> 15 N3b
6 B 1.00 60 1 N1 <NA> <NA>
为什么.x
和.y
生成,以及我怎样才能获得a$n_otte
,独家?
预期产出
> head(a)
study os.neck age n_sygdom n_otte
1 B 49.00 53 0 N0
2 B 1.00 83 0 N0
3 A 76.44 63 2 <NA>
4 B 11.00 45 0 N0
5 A 9.21 37 15 N3b
6 B 1.00 60 1 N1
主要数据
h <- structure(list(study = c("B", "B", "A", "B", "A", "B", "A", "A",
"B", "B", "B", "B", "A", "A", "A", "A", "C", "B", "A", "C", "B",
"B", "B", "B", "A", "B", "A", "B", "A", "A", "C", "B", "B", "A",
"B", "C", "B", "B", "B", "C", "A", "C", "C", "B", "B", "C", "C",
"B", "B", "C", "C", "B", "B", "A", "B", "B", "C", "B", "C", "A",
"A", "C", "C", "A", "B", "B", "C", "B", "C", "C", "C", "B", "C",
"A", "B", "A", "B", "B", "C", "C", "B", "B", "B", "B", "C", "B",
"A", "A", "B", "C", "C", "B", "B", "B", "C", "B", "B", "B", "A",
"B"), os.neck = c(49, 1, 76.44, 11, 9.21, 1, 2.07, 4.08, 17,
11, 41, 38, 84.96, 5.64, 93.86, 11.52, 5.29, 61, 10.95, 3.68,
24, 63, 21, 68, 6.12, 7, 11.52, 48, 11.38, 73.68, 27.53, 12,
19, 17.98, 55, 77.77, 39, 4, 13, 57.56, 24.59, 46.55, 83.02,
14, 42, 49.58, 33.58, 33, 21, 29.96, 10.41, 67, 8, 94.72, 2,
1, 7.03, 1, 46.36, 23.76, 57.48, 14.49, 14.69, 39.62, 68, 5,
35.78, 75, 80.82, 54.24, 49.12, 87, 50.96, 2.4, 10, 7.2, 17,
10, 34.56, 104.08, 5, 28, 13, 17, 104.08, 29, 5.04, 54.96, 49,
4.27, 47.93, 5, 60, 47, 14.49, 3, 32, 23, 13.97, 32), age = c(53,
83, 63, 45, 37, 60, 52, 64, 53, 78, 67, 68, 45, 68, 43, 72, 65,
65, 59, 58, 51, 61, 62, 61, 60, 66, 64, 62, 56, 41, 51, 69, 42,
65, 59, 60, 58, 58, 63, 71, 60, 72, 58, 55, 63, 63, 49, 79, 54,
57, 50, 54, 77, 64, 74, 85, 50, 64, 60, 43, 67, 63, 68, 64, 65,
67, 57, 65, 47, 70, 61, 66, 63, 33, 75, 74, 80, 80, 62, 80, 55,
74, 78, 80, 80, 33, 34, 52, 51, 47, 51, 54, 74, 71, 63, 81, 53,
46, 48, 49), n_sygdom = c(0L, 0L, 2L, 0L, 15L, 1L, 8L, 6L, 1L,
0L, 3L, 0L, 6L, 5L, 2L, 0L, 1L, 0L, 8L, 0L, 0L, 0L, 0L, 0L, 8L,
5L, 20L, 0L, 4L, 5L, 1L, 1L, 0L, 5L, 0L, 0L, 1L, 0L, 9L, 0L,
5L, 0L, 0L, 2L, 1L, 1L, 8L, 0L, 1L, 23L, 0L, 1L, 0L, 2L, 1L,
6L, 2L, 0L, 2L, 3L, 1L, 1L, 0L, 6L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L, 3L, 0L, 10L, 0L, 0L, 1L, 1L, 1L, 0L, 3L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 0L, 4L, 0L, 0L, 1L, 8L, 0L, 0L, 3L, 0L), n_otte = structure(c(1L,
1L, NA, 1L, NA, 2L, NA, NA, 6L, 1L, 5L, 1L, NA, NA, NA, NA, NA,
1L, NA, NA, 1L, 1L, 1L, 1L, NA, 4L, NA, 1L, NA, NA, NA, 2L, 1L,
NA, 1L, NA, 2L, 1L, 5L, NA, NA, NA, NA, 6L, 2L, NA, NA, 1L, 2L,
NA, NA, 6L, 1L, NA, 6L, 5L, NA, 1L, NA, NA, NA, NA, NA, NA, 1L,
1L, NA, 1L, NA, NA, NA, 1L, NA, NA, 1L, NA, 6L, 1L, NA, NA, 2L,
1L, 6L, 6L, NA, 1L, NA, NA, 1L, NA, NA, 6L, 1L, 1L, NA, 6L, 1L,
1L, NA, 1L), .Label = c("N0", "N1", "N2a", "N2b", "N2c", "N3b"
), class = "factor")), row.names = c(NA, -100L), class = "data.frame")
要提取的数据
yy <- structure(list(study = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "A", class = "factor"),
os.neck = c(24.84, 24.84, 9.76, 98.28, 19.08, 111.48, 41.52,
47.28, 35.24, 6.38, 57.48, 39.78, 35.52, 70.08, 12.49, 19.33,
3.02, 40.77, 32.71, 40.08, 59.4, 52.18, 48.33, 1.38, 26.89,
35.52, 59.18, 5.04, 6.24, 80.65, 5.13, 49.84, 9.48, 3.25,
46.42, 25.15, 10.8, 23.76, 17.1, 27.6, 4.68, 12.3, 52.96,
49.97, 10.98, 44.64, 39.78, 10.8, 9.5, 20.19, 11.97, 22.88,
60.59, 85.15, 55.04, 7.2, 28.2, 33.96, 2.76, 4.77, 9.96,
1.38, 33.4, 27.29, 37.2, 36.36, 90.28, 53.65, 32.09, 17.98,
68.28, 7.63, 36.36, 22.32, 43.2, 9.36, 5.88, 14.79, 48.1,
45.24, 9.21, 110.01, 42.12, 0.3, 0.56, 11.88, 46.26, 59.15,
87.22, 11.93, 88.8, 36.36, 29.19, 14.07, 11.21, 16.08, 20.58,
3.48, 73.74, 45.72), n_sygdom = structure(c(2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 16L, 1L, 1L, 1L, 4L, 11L, 1L, 2L, 1L, 1L,
2L, 11L, 1L, 1L, 13L, 2L, 1L, 18L, 1L, 1L, 1L, 11L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 13L, 1L, 1L, 1L, 11L, 1L,
16L, 13L, 1L, 11L, 2L, 1L, 16L, 1L, 13L, 1L, 2L, 2L, 11L,
1L, 2L, 17L, 2L, 1L, 1L, 8L, 1L, 1L, 1L, 11L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 16L, 1L, 11L, 1L, 1L, 2L, 1L), .Label = c("0",
"1", "10", "11", "12", "13", "14", "15", "17", "18", "2",
"20", "3", "35", "39", "4", "5", "6", "7", "8", "9", "number"
), class = "factor"), age = c(44, 44, 45, 51, 50, 59, 30,
49, 35, 53, 67, 79, 64, 68, 75, 55, 67, 42, 65, 37, 77, 32,
60, 80, 64, 64, 45, 72, 54, 37, 45, 47, 70, 54, 56, 45, 71,
43, 53, 38, 58, 76, 26, 50, 65, 67, 79, 71, 67, 56, 79, 47,
54, 35, 41, 62, 64, 58, 41, 65, 73, 80, 68, 47, 32, 49, 32,
45, 56, 33, 61, 33, 49, 52, 61, 75, 60, 55, 28, 82, 37, 32,
63, 86, 63, 60, 63, 54, 63, 84, 27, 55, 67, 59, 72, 63, 47,
62, 53, 34), n_otte = structure(c(3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 6L, 1L, 1L, 1L, 6L, 4L, 1L, 3L, 1L, 1L, 3L, 5L, 1L,
1L, 4L, 3L, 1L, 6L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 6L, 1L, 6L, 6L, 1L, 4L, 2L,
1L, 6L, 1L, 3L, 1L, 3L, 2L, 6L, 1L, 2L, 6L, 2L, 1L, 1L, 6L,
1L, 1L, 1L, 4L, 1L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 6L, 1L, 6L,
1L, 1L, 2L, 1L), .Label = c("N0", "N1", "N2a", "N2b", "N2c",
"N3b"), class = "factor")), row.names = c(NA, -100L), class = "data.frame")
解决方案
问题1的解决方案:
要将因子转换为等效数字,您需要先转换为字符。factors
在内部存储为数字,因此当您直接将它们转换为数字时,它会返回它的内部数字表示。
这个例子可能会说清楚:
as.integer(factor(c(2, 10, 3, 0)))
[1] 2 4 3 1
as.integer(as.character(factor(c(2, 10, 3, 0))))
[1] 2 10 3 0
对于您的示例,请执行以下操作:
library(dplyr)
yy <- yy %>% mutate(n_sygdom = as.integer(as.character(n_sygdom)))
问题2的解决方案:
现在您可以加入h
andyy
并使用来获取和coalesce
之间的第一个非 NA 值。n_otte.x
n_otte.y
left_join(h, yy, by=c("study", "os.neck", "age", "n_sygdom")) %>%
mutate(n_otte = coalesce(n_otte.x, n_otte.y)) %>%
select(-n_otte.x,-n_otte.y)
推荐阅读
- php - 如何从控制器中的路由移动功能?
- sql - 需要sql中用户的总和
- mysql - mysql中的值超出范围双倍
- python - 使用熊猫对两列进行排序并为数据框中的排序值创建新列
- jquery - PlaceHolder 未显示在 Modal 的 Select2 下拉菜单中
- c# - 关注动态生成的 CollectionView 条目的变化
- angular - 轴 #0 的数据列不能是字符串类型(有时) - Google Charts
- mysql - 如何在mysql中获取所有具有自动增量主键的表?
- selenium - 如何从canvas元素管理的条形图、饼图等中获取文本或值?
- tensorflow - Tensorflow:如何为每个样本批量索引一个元素