首页 > 解决方案 > 如何通过多个列为每个 id 连接表

问题描述

我被迫删除了我之前的主题,因为它没有很好地询问并且示例有点复杂,所以这里是一个简单的。

我有 2 个数据框:

DF1<-data.frame(id1=c(1,1,1,1,1,2),client_code=c("x1","x1","x1","x2","x2","x3"),id2=c("a","b","c","d","e","y"),value1=c(0.1,0.2,0.3,0.4,0.5,0.6),value2=c(1.1,1.2,1.3,1.4,1.5,1.6))

> DF1
  id1 client_code id2 value1 value2
1   1          x1   a    0.1    1.1
2   1          x1   b    0.2    1.2
3   1          x1   c    0.3    1.3
4   1          x2   d    0.4    1.4
5   1          x2   e    0.5    1.5
6   2          x3   y    0.6    1.6

DF2<-data.frame(id1=c(1,1,1,1,1,1,2,2),id2=c("a","b","c","d","e","f","x","y"),value1=c(10,11,12,13,14,15,16,17),value2=c(20,21,22,23,24,25,26,27))

> DF2
  id1 id2 value1 value2
1   1   a     10     20
2   1   b     11     21
3   1   c     12     22
4   1   d     13     23
5   1   e     14     24
6   1   f     15     25
7   2   x     16     26
8   2   y     17     27

每个客户端属于一组客户端,由列 (id1) 标识

我想要做的是添加 DF2 中缺失的行,它们的 id2 在 DF1 中不存在。应该为属于 DF1 中同一组客户端 (id1) 的每个客户端 (client_code) 完成此过程。

(不知道我说得够不够清楚)

所需的输出:

output<-data.frame(id1=c(1,1,1,1,1,1,1,1,1,1,1,1,2,2),client_code=c("x1","x1","x1","x1","x1","x1","x2","x2","x2","x2","x2","x2","x3","x3"),id2=c("a","b","c","d","e","f","d","e","a","b","c","f","y","x"),                  value1=c(0.1,0.2,0.3,13,14,15,0.4,0.5,10,11,12,15,0.6,16),value2=c(1.1,1.2,1.3,23,24,25,1.4,1.5,20,21,22,25,1.6,26))

> output
   id1 client_code id2 value1 value2
1    1          x1   a    0.1    1.1
2    1          x1   b    0.2    1.2
3    1          x1   c    0.3    1.3
4    1          x1   d   13.0   23.0
5    1          x1   e   14.0   24.0
6    1          x1   f   15.0   25.0
7    1          x2   d    0.4    1.4
8    1          x2   e    0.5    1.5
9    1          x2   a   10.0   20.0
10   1          x2   b   11.0   21.0
11   1          x2   c   12.0   22.0
12   1          x2   f   15.0   25.0
13   2          x3   y    0.6    1.6
14   2          x3   x   16.0   26.0

提前致谢。

标签: rjoingrouping

解决方案


首先,创建一个表格,为每个客户显示id2应该存在的值:

library(tidyverse)

client_defaults <- df1 %>% 
  distinct(client_code, id1) %>% 
  left_join(df2, by = "id1")

client_defaults
#>    client_code id1 id2 value1 value2
#> 1           x1   1   a     10     20
#> 2           x1   1   b     11     21
#> 3           x1   1   c     12     22
#> 4           x1   1   d     13     23
#> 5           x1   1   e     14     24
#> 6           x1   1   f     15     25
#> 7           x2   1   a     10     20
#> 8           x2   1   b     11     21
#> 9           x2   1   c     12     22
#> 10          x2   1   d     13     23
#> 11          x2   1   e     14     24
#> 12          x2   1   f     15     25
#> 13          x3   2   x     16     26
#> 14          x3   2   y     17     27

然后,删除 中已存在的行df1,并添加其余行:

client_missing <- client_defaults %>% 
  anti_join(df1, by = c("client_code", "id2"))

bind_rows(df1, client_missing) %>% arrange(client_code)
#>    client_code id1 id2 value1 value2
#> 1           x1   1   a    0.1    1.1
#> 2           x1   1   b    0.2    1.2
#> 3           x1   1   c    0.3    1.3
#> 4           x1   1   d   13.0   23.0
#> 5           x1   1   e   14.0   24.0
#> 6           x1   1   f   15.0   25.0
#> 7           x2   1   d    0.4    1.4
#> 8           x2   1   e    0.5    1.5
#> 9           x2   1   a   10.0   20.0
#> 10          x2   1   b   11.0   21.0
#> 11          x2   1   c   12.0   22.0
#> 12          x2   1   f   15.0   25.0
#> 13          x3   2   y    0.6    1.6
#> 14          x3   2   x   16.0   26.0

数据:

df1 <- data.frame(client_code = c("x1", "x1", "x1", "x2", "x2", "x3"), id1 = c(1, 1, 1, 1, 1, 2), id2 = c("a", "b", "c", "d", "e", "y"), value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6), value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6), stringsAsFactors = FALSE)

df2 <- data.frame(id1 = c(1, 1, 1, 1, 1, 1, 2, 2), id2 = c("a", "b", "c", "d", "e", "f", "x", "y"), value1 = c(10, 11, 12, 13, 14, 15, 16, 17), value2 = c(20, 21, 22, 23, 24, 25, 26, 27), stringsAsFactors = FALSE)

reprex 包(v0.2.1)于 2019 年 7 月 1 日创建


推荐阅读