首页 > 解决方案 > 使用 left_join 提取数据时的困难

问题描述

请在下面找到我的数据。我遇到了两个问题。

我正在尝试将yy$n_otte值合并到缺失h$n_otte值中。我的方法是dplyr::left_join匹配 by study,os.neck和between n_sygdomand 。我需要将所有这些变量作为两者进行匹配,并包含两个大型电子表格。agehyyhyy

> head(h)
  study os.neck age n_sygdom n_otte
1     B   49.00  53        0     N0
2     B    1.00  83        0     N0
3     A   76.44  63        2   <NA>
4     B   11.00  45        0     N0
5     A    9.21  37       15   <NA>
6     B    1.00  60        1     N1

> head(yy)
  study os.neck n_sygdom age n_otte
1     A   42.12        0  63     N0
2     A   30.72        0  61     N0
3     A  136.20        0  48     N0
4     A   23.40        0  63     N0
5     A    5.16        3  67    N3b
6     A   33.96        0  58     N0

问题 1:为什么 as_integer() 会改变我的值?

> str(yy)
'data.frame':   643 obs. of  5 variables:
 $ study   : Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
 $ os.neck : num  42.12 30.72 136.2 23.4 5.16 ...
 $ n_sygdom: Factor w/ 22 levels "0","1","10","11",..: 1 1 1 1 13 1 11 11 2 1 ...
 $ age     : num  63 61 48 63 67 58 23 52 53 62 ...
 $ n_otte  : Factor w/ 6 levels "N0","N1","N2a",..: 1 1 1 1 6 1 6 4 3 1 ...

我在尝试着

yy <- yy %>% mutate(n_sygdom = as.integer(n_sygdom))

但是yy$n_sygdom变化。

> head(yy)
  study os.neck n_sygdom age n_otte
1     A   42.12        1  63     N0
2     A   30.72        1  61     N0
3     A  136.20        1  48     N0
4     A   23.40        1  63     N0
5     A    5.16       13  67    N3b
6     A   33.96        1  58     N0

问题 为什么会yy$n_sygdom改变?我想包含yy$n_sygdom为整数,但显然保留初始整数。

问题 2:left_join 匹配没有产生预期的输出

显然,首先需要解决问题 1,因为

a <- left_join(h, yy, by=c("study", "os.neck", "age", "n_sygdom"))

产量

由于类型不兼容(因子/整数),无法加入 'n_sygdom' x 'n_sygdom'

但是,我遇到的问题也在这里(没有n_sygdom):

a <- left_join(h, yy, by=c("study", "os.neck", "age"))

> head(a)
  study os.neck age n_sygdom.x n_otte.x n_sygdom.y n_otte.y
1     B   49.00  53          0       N0       <NA>     <NA>
2     B    1.00  83          0       N0       <NA>     <NA>
3     A   76.44  63          2     <NA>       <NA>     <NA>
4     B   11.00  45          0       N0       <NA>     <NA>
5     A    9.21  37         15     <NA>         15      N3b
6     B    1.00  60          1       N1       <NA>     <NA>

为什么.x.y生成,以及我怎样才能获得a$n_otte,独家?

预期产出

> head(a)
  study os.neck age   n_sygdom   n_otte 
1     B   49.00  53          0       N0       
2     B    1.00  83          0       N0       
3     A   76.44  63          2     <NA>       
4     B   11.00  45          0       N0      
5     A    9.21  37         15      N3b         
6     B    1.00  60          1       N1       

主要数据

  h <- structure(list(study = c("B", "B", "A", "B", "A", "B", "A", "A", 
"B", "B", "B", "B", "A", "A", "A", "A", "C", "B", "A", "C", "B", 
"B", "B", "B", "A", "B", "A", "B", "A", "A", "C", "B", "B", "A", 
"B", "C", "B", "B", "B", "C", "A", "C", "C", "B", "B", "C", "C", 
"B", "B", "C", "C", "B", "B", "A", "B", "B", "C", "B", "C", "A", 
"A", "C", "C", "A", "B", "B", "C", "B", "C", "C", "C", "B", "C", 
"A", "B", "A", "B", "B", "C", "C", "B", "B", "B", "B", "C", "B", 
"A", "A", "B", "C", "C", "B", "B", "B", "C", "B", "B", "B", "A", 
"B"), os.neck = c(49, 1, 76.44, 11, 9.21, 1, 2.07, 4.08, 17, 
11, 41, 38, 84.96, 5.64, 93.86, 11.52, 5.29, 61, 10.95, 3.68, 
24, 63, 21, 68, 6.12, 7, 11.52, 48, 11.38, 73.68, 27.53, 12, 
19, 17.98, 55, 77.77, 39, 4, 13, 57.56, 24.59, 46.55, 83.02, 
14, 42, 49.58, 33.58, 33, 21, 29.96, 10.41, 67, 8, 94.72, 2, 
1, 7.03, 1, 46.36, 23.76, 57.48, 14.49, 14.69, 39.62, 68, 5, 
35.78, 75, 80.82, 54.24, 49.12, 87, 50.96, 2.4, 10, 7.2, 17, 
10, 34.56, 104.08, 5, 28, 13, 17, 104.08, 29, 5.04, 54.96, 49, 
4.27, 47.93, 5, 60, 47, 14.49, 3, 32, 23, 13.97, 32), age = c(53, 
83, 63, 45, 37, 60, 52, 64, 53, 78, 67, 68, 45, 68, 43, 72, 65, 
65, 59, 58, 51, 61, 62, 61, 60, 66, 64, 62, 56, 41, 51, 69, 42, 
65, 59, 60, 58, 58, 63, 71, 60, 72, 58, 55, 63, 63, 49, 79, 54, 
57, 50, 54, 77, 64, 74, 85, 50, 64, 60, 43, 67, 63, 68, 64, 65, 
67, 57, 65, 47, 70, 61, 66, 63, 33, 75, 74, 80, 80, 62, 80, 55, 
74, 78, 80, 80, 33, 34, 52, 51, 47, 51, 54, 74, 71, 63, 81, 53, 
46, 48, 49), n_sygdom = c(0L, 0L, 2L, 0L, 15L, 1L, 8L, 6L, 1L, 
0L, 3L, 0L, 6L, 5L, 2L, 0L, 1L, 0L, 8L, 0L, 0L, 0L, 0L, 0L, 8L, 
5L, 20L, 0L, 4L, 5L, 1L, 1L, 0L, 5L, 0L, 0L, 1L, 0L, 9L, 0L, 
5L, 0L, 0L, 2L, 1L, 1L, 8L, 0L, 1L, 23L, 0L, 1L, 0L, 2L, 1L, 
6L, 2L, 0L, 2L, 3L, 1L, 1L, 0L, 6L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 
0L, 1L, 3L, 0L, 10L, 0L, 0L, 1L, 1L, 1L, 0L, 3L, 0L, 1L, 0L, 
1L, 0L, 0L, 0L, 0L, 4L, 0L, 0L, 1L, 8L, 0L, 0L, 3L, 0L), n_otte = structure(c(1L, 
1L, NA, 1L, NA, 2L, NA, NA, 6L, 1L, 5L, 1L, NA, NA, NA, NA, NA, 
1L, NA, NA, 1L, 1L, 1L, 1L, NA, 4L, NA, 1L, NA, NA, NA, 2L, 1L, 
NA, 1L, NA, 2L, 1L, 5L, NA, NA, NA, NA, 6L, 2L, NA, NA, 1L, 2L, 
NA, NA, 6L, 1L, NA, 6L, 5L, NA, 1L, NA, NA, NA, NA, NA, NA, 1L, 
1L, NA, 1L, NA, NA, NA, 1L, NA, NA, 1L, NA, 6L, 1L, NA, NA, 2L, 
1L, 6L, 6L, NA, 1L, NA, NA, 1L, NA, NA, 6L, 1L, 1L, NA, 6L, 1L, 
1L, NA, 1L), .Label = c("N0", "N1", "N2a", "N2b", "N2c", "N3b"
), class = "factor")), row.names = c(NA, -100L), class = "data.frame")

要提取的数据

    yy <- structure(list(study = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "A", class = "factor"), 
    os.neck = c(24.84, 24.84, 9.76, 98.28, 19.08, 111.48, 41.52, 
    47.28, 35.24, 6.38, 57.48, 39.78, 35.52, 70.08, 12.49, 19.33, 
    3.02, 40.77, 32.71, 40.08, 59.4, 52.18, 48.33, 1.38, 26.89, 
    35.52, 59.18, 5.04, 6.24, 80.65, 5.13, 49.84, 9.48, 3.25, 
    46.42, 25.15, 10.8, 23.76, 17.1, 27.6, 4.68, 12.3, 52.96, 
    49.97, 10.98, 44.64, 39.78, 10.8, 9.5, 20.19, 11.97, 22.88, 
    60.59, 85.15, 55.04, 7.2, 28.2, 33.96, 2.76, 4.77, 9.96, 
    1.38, 33.4, 27.29, 37.2, 36.36, 90.28, 53.65, 32.09, 17.98, 
    68.28, 7.63, 36.36, 22.32, 43.2, 9.36, 5.88, 14.79, 48.1, 
    45.24, 9.21, 110.01, 42.12, 0.3, 0.56, 11.88, 46.26, 59.15, 
    87.22, 11.93, 88.8, 36.36, 29.19, 14.07, 11.21, 16.08, 20.58, 
    3.48, 73.74, 45.72), n_sygdom = structure(c(2L, 2L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 16L, 1L, 1L, 1L, 4L, 11L, 1L, 2L, 1L, 1L, 
    2L, 11L, 1L, 1L, 13L, 2L, 1L, 18L, 1L, 1L, 1L, 11L, 1L, 1L, 
    1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 13L, 1L, 1L, 1L, 11L, 1L, 
    16L, 13L, 1L, 11L, 2L, 1L, 16L, 1L, 13L, 1L, 2L, 2L, 11L, 
    1L, 2L, 17L, 2L, 1L, 1L, 8L, 1L, 1L, 1L, 11L, 1L, 2L, 2L, 
    2L, 1L, 1L, 1L, 1L, 16L, 1L, 11L, 1L, 1L, 2L, 1L), .Label = c("0", 
    "1", "10", "11", "12", "13", "14", "15", "17", "18", "2", 
    "20", "3", "35", "39", "4", "5", "6", "7", "8", "9", "number"
    ), class = "factor"), age = c(44, 44, 45, 51, 50, 59, 30, 
    49, 35, 53, 67, 79, 64, 68, 75, 55, 67, 42, 65, 37, 77, 32, 
    60, 80, 64, 64, 45, 72, 54, 37, 45, 47, 70, 54, 56, 45, 71, 
    43, 53, 38, 58, 76, 26, 50, 65, 67, 79, 71, 67, 56, 79, 47, 
    54, 35, 41, 62, 64, 58, 41, 65, 73, 80, 68, 47, 32, 49, 32, 
    45, 56, 33, 61, 33, 49, 52, 61, 75, 60, 55, 28, 82, 37, 32, 
    63, 86, 63, 60, 63, 54, 63, 84, 27, 55, 67, 59, 72, 63, 47, 
    62, 53, 34), n_otte = structure(c(3L, 3L, 1L, 1L, 1L, 1L, 
    1L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 6L, 1L, 1L, 1L, 6L, 4L, 1L, 3L, 1L, 1L, 3L, 5L, 1L, 
    1L, 4L, 3L, 1L, 6L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 2L, 1L, 
    1L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 6L, 1L, 6L, 6L, 1L, 4L, 2L, 
    1L, 6L, 1L, 3L, 1L, 3L, 2L, 6L, 1L, 2L, 6L, 2L, 1L, 1L, 6L, 
    1L, 1L, 1L, 4L, 1L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 6L, 1L, 6L, 
    1L, 1L, 2L, 1L), .Label = c("N0", "N1", "N2a", "N2b", "N2c", 
    "N3b"), class = "factor")), row.names = c(NA, -100L), class = "data.frame")

标签: rdataframedplyrleft-join

解决方案


问题1的解决方案:

要将因子转换为等效数字,您需要先转换为字符。factors在内部存储为数字,因此当您直接将它们转换为数字时,它会返回它的内部数字表示。

这个例子可能会说清楚:

as.integer(factor(c(2, 10, 3, 0)))
[1] 2 4 3 1
as.integer(as.character(factor(c(2, 10, 3, 0))))
[1]  2 10  3  0

对于您的示例,请执行以下操作:

library(dplyr)
yy <- yy %>% mutate(n_sygdom = as.integer(as.character(n_sygdom)))

问题2的解决方案:

现在您可以加入handyy并使用来获取和coalesce之间的第一个非 NA 值。n_otte.xn_otte.y

left_join(h, yy, by=c("study", "os.neck", "age", "n_sygdom")) %>%
  mutate(n_otte = coalesce(n_otte.x, n_otte.y)) %>%
  select(-n_otte.x,-n_otte.y)

推荐阅读