r - INNER JOIN 返回不需要的匹配,SEMI JOIN 返回预期的输出,但缺少列
问题描述
我现在面临这个问题一段时间,希望有人可以帮助我解决它。
我有两个数据框。
第一个包含有关每个客户拨打的电话的数据:
Calls <- structure(list(OpenUser = c(55555, 33333, 22222, 44444, 22222,
55555), OpenFirstName = c("Shir", "Yael", "Yair", "Roni", "Yair",
"Shir"), OpenLastName = c("Shiran", "Eilon", "Yaron", "Ron",
"Yaron", "Shiran"), CustomerID = c(836, 1070, 1390, 2970, 3646,
3646), CRMEventStartDate = structure(c(1441065600, 1441065600,
1431129600, 1435881600, 1417392000, 1441497600), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), NumOfOptions = c(1L, 1L, 2L, 3L, 3L,
3L)), row.names = c(NA, -6L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), groups = structure(list(OpenUser = c(22222,
22222, 33333, 44444, 55555, 55555), OpenFirstName = c("Yair",
"Yair", "Yael", "Roni", "Shir", "Shir"), OpenLastName = c("Yaron",
"Yaron", "Eilon", "Ron", "Shiran", "Shiran"), CustomerID = c(1390,
3646, 1070, 2970, 836, 3646), .rows = list(3L, 5L, 2L, 4L, 1L,
6L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE))
第二个包含提供给每个客户的活动的数据,以及响应:
Response <- structure(list(CampaignStrategyID = c(512345, 512345, 512345,
121212, 512345, 121212), CustomerID = c(836, 1070, 1390, 2970,
3479, 3646), ResponseDate = structure(c(1441065600, 1441065600,
1431129600, 1435881600, 1420502400, 1417392000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), ResponseCode = c(1, 1, 1, 3, 2, 1)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
我必须为客户的每个电话展示,实际提供的活动是什么,以及他们的反应是什么。
CustomerID 的 INNER JOIN 导致不希望的匹配(不仅是提供的实际活动)
SEMI JOIN 产生所需的匹配项(我认为),但没有来自第二个数据帧(CampaignStrategyID 和 ResponseCode)的所需列。SEMI JOIN 完成后,我没有设法添加此列。
希望有人能帮忙。
提前致谢。
解决方案
更新 2019-03-03
这将使用提供的新数据进行更新。为了排除故障,我首先将示例数据过滤到记录的问题案例CustomerID
7033。
library(tidyverse)
library(lubridate)
Calls <- tibble(
OpenUser = c(55555, 33333, 22222, 44444, 22222, 55555, 55555, 11111, 11111,
44444, 44444, 11111, 44444, 44444, 33333, 44444, 11111, 33333,
44444, 22222),
OpenFirstName = c("Shir", "Yael", "Yair", "Roni", "Yair", "Shir", "Shir",
"Sigal", "Sigal", "Roni", "Roni", "Sigal", "Roni", "Roni",
"Yael", "Roni", "Sigal", "Yael", "Roni", "Yair"),
OpenLastName = c("Shiran", "Eilon", "Yaron", "Ron", "Yaron", "Shiran",
"Shiran", "segal", "segal", "Ron", "Ron", "segal", "Ron",
"Ron", "Eilon", "Ron", "segal", "Eilon", "Ron", "Yaron"),
CustomerID = c(836, 1070, 1390, 2970, 3646, 3646, 4542, 7033, 7033, 8838,
8838, 9040, 9040, 9973, 9973, 17472, 17472, 20409, 21626,
21632),
CRMEventStartDate = ymd(c("2015-09-01", "2015-09-01", "2015-05-09",
"2015-07-03", "2014-12-01", "2015-09-06",
"2015-07-01", "2015-05-02", "2015-07-03",
"2015-06-04", "2015-07-08", "2015-03-01",
"2015-05-06", "2015-05-31", "2015-08-09",
"2015-04-01", "2015-08-02", "2015-04-01",
"2015-04-04", "2015-02-01")),
NumOfOptions = c(1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L)
)
Response <- tibble(
CampaignStrategyID = c(512345, 512345, 512345, 121212, 512345, 121212, 516345,
516345, 121212, 512345, 121212, 512345, 121212, 512345,
516345, 512345, 512345, 512345, 121212, 516345, 512345,
512345, 516345, 512345, 516345, 512345, 512345, 121212,
121212, 512345, 121212, 512345, 516345, 516345, 512345,
121212, 121212, 121212, 512345, 512345, 121212, 516345,
121212, 121212, 516345, 516345, 512345, 121212, 121212,
121212),
CustomerID = c(836, 1070, 1390, 2970, 3479, 3646, 3646, 4278, 4542, 7033,
7033, 8838, 8838, 9040, 9040, 9973, 9973, 17472, 17472, 20409,
21626, 21632, 22414, 24005, 24005, 26432, 26432, 28101, 28101,
31536, 31536, 31598, 31598, 32979, 32979, 35099, 37471, 37471,
38832, 38832, 39778, 40318, 40318, 46547, 48885, 48885, 49523,
49585, 55134, 56878),
ResponseDate = ymd(c("2015-09-01", "2015-09-01", "2015-05-09", "2015-07-03",
"2015-01-06", "2014-12-01", "2015-09-06", "2015-01-02",
"2015-07-01", "2015-05-02", "2015-07-03", "2015-06-04",
"2015-07-08", "2015-03-01", "2015-05-06", "2015-05-31",
"2015-08-09", "2015-04-01", "2015-08-02", "2015-04-01",
"2015-04-04", "2015-02-01", "2015-01-31", "2015-05-08",
"2015-06-08", "2015-02-08", "2015-05-09", "2015-01-08",
"2015-08-04", "2015-06-06", "2015-06-09", "2015-05-05",
"2015-05-08", "2015-02-04", "2015-09-01", "2014-12-02",
"2014-12-04", "2015-07-02", "2015-05-08", "2015-09-01",
"2015-07-03", "2015-04-03", "2015-06-02", "2014-12-09",
"2015-01-03", "2015-09-09", "2015-06-04", "2015-08-06",
"2015-06-09", "2015-02-06")),
ResponseCode = c(1, 1, 1, 3, 2, 1, 4, 2, 3, 1, 3, 2, 1, 3, 1, 1, 1, 4, 1, 4,
1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 3, 1, 1, 2, 1, 1, 1, 3, 1, 1,
3, 1, 1, 2, 2, 4, 1, 1, 3, 2)
)
test_calls <- filter(Calls, CustomerID == 7033)
test_calls
#> # A tibble: 2 x 6
#> OpenUser OpenFirstName OpenLastName CustomerID CRMEventStartDa…
#> <dbl> <chr> <chr> <dbl> <date>
#> 1 11111 Sigal segal 7033 2015-05-02
#> 2 11111 Sigal segal 7033 2015-07-03
#> # … with 1 more variable: NumOfOptions <int>
test_response <- filter(Response, CustomerID == 7033)
test_response
#> # A tibble: 2 x 4
#> CampaignStrategyID CustomerID ResponseDate ResponseCode
#> <dbl> <dbl> <date> <dbl>
#> 1 512345 7033 2015-05-02 1
#> 2 121212 7033 2015-07-03 3
使用test_calls
andtest_response
数据,我能够重现该问题:
inner_join(test_calls, test_response, by = "CustomerID")
#> # A tibble: 4 x 9
#> OpenUser OpenFirstName OpenLastName CustomerID CRMEventStartDa…
#> <dbl> <chr> <chr> <dbl> <date>
#> 1 11111 Sigal segal 7033 2015-05-02
#> 2 11111 Sigal segal 7033 2015-05-02
#> 3 11111 Sigal segal 7033 2015-07-03
#> 4 11111 Sigal segal 7033 2015-07-03
#> # … with 4 more variables: NumOfOptions <int>, CampaignStrategyID <dbl>,
#> # ResponseDate <date>, ResponseCode <dbl>
现在我可以看到这是由于匹配发生的方式而发生的。因为我们只匹配CustomerID
,所有匹配的行都在Calls
其中有对应的行Response
,所以你得到了Calls
x 中的 2 行中的2 行的所有组合Response
(导致 4 行)。
为了纠正这个问题,我们还可以匹配日期变量(CRMEventStartDate
和ResponseDate
):
inner_join(test_calls, test_response,
by = c("CustomerID", "CRMEventStartDate" = "ResponseDate"))
#> # A tibble: 2 x 8
#> OpenUser OpenFirstName OpenLastName CustomerID CRMEventStartDa…
#> <dbl> <chr> <chr> <dbl> <date>
#> 1 11111 Sigal segal 7033 2015-05-02
#> 2 11111 Sigal segal 7033 2015-07-03
#> # … with 3 more variables: NumOfOptions <int>, CampaignStrategyID <dbl>,
#> # ResponseCode <dbl>
这会产生预期的 2 行,响应与CustomerID
事件的日期和日期相匹配。
原帖
我认为inner_join
应该给你你想要的。将返回和中的inner_join
所有列,但仅返回两者中都具有匹配变量 ( ) 的行。因此,例如,数据中不存在 3749 ,因此连接数据中没有它们的行。相反,3646 在数据中有两行,因此它们在连接数据中有 2 行。Calls
Response
CustomerID
CustomerID
Calls
CustomerID
Calls
library(tidyverse)
Calls <- structure(list(OpenUser = c(55555, 33333, 22222, 44444, 22222, 55555),
OpenFirstName = c("Shir", "Yael", "Yair", "Roni", "Yair", "Shir"),
OpenLastName = c("Shiran", "Eilon", "Yaron", "Ron", "Yaron", "Shiran"),
CustomerID = c(836, 1070, 1390, 2970, 3646, 3646),
CRMEventStartDate = structure(c(1441065600, 1441065600, 1431129600, 1435881600, 1417392000, 1441497600),
class = c("POSIXct", "POSIXt"), tzone = "UTC"),
NumOfOptions = c(1L, 1L, 2L, 3L, 3L, 3L)),
row.names = c(NA, -6L),
class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
groups = structure(list(OpenUser = c(22222, 22222, 33333, 44444, 55555, 55555),
OpenFirstName = c("Yair", "Yair", "Yael", "Roni", "Shir", "Shir"),
OpenLastName = c("Yaron", "Yaron", "Eilon", "Ron", "Shiran", "Shiran"),
CustomerID = c(1390, 3646, 1070, 2970, 836, 3646),
.rows = list(3L, 5L, 2L, 4L, 1L, 6L)),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame"),
.drop = TRUE))
Response <- structure(list(CampaignStrategyID = c(512345, 512345, 512345, 121212, 512345, 121212),
CustomerID = c(836, 1070, 1390, 2970, 3479, 3646),
ResponseDate = structure(c(1441065600, 1441065600, 1431129600, 1435881600, 1420502400, 1417392000),
class = c("POSIXct", "POSIXt"), tzone = "UTC"),
ResponseCode = c(1, 1, 1, 3, 2, 1)),
row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
inner_join(Calls, Response)
#> Joining, by = "CustomerID"
#> # A tibble: 6 x 9
#> # Groups: OpenUser, OpenFirstName, OpenLastName, CustomerID [6]
#> OpenUser OpenFirstName OpenLastName CustomerID CRMEventStartDate
#> <dbl> <chr> <chr> <dbl> <dttm>
#> 1 55555 Shir Shiran 836 2015-09-01 00:00:00
#> 2 33333 Yael Eilon 1070 2015-09-01 00:00:00
#> 3 22222 Yair Yaron 1390 2015-05-09 00:00:00
#> 4 44444 Roni Ron 2970 2015-07-03 00:00:00
#> 5 22222 Yair Yaron 3646 2014-12-01 00:00:00
#> 6 55555 Shir Shiran 3646 2015-09-06 00:00:00
#> # … with 4 more variables: NumOfOptions <int>, CampaignStrategyID <dbl>,
#> # ResponseDate <dttm>, ResponseCode <dbl>
由reprex 包(v0.2.1)于 2019 年 2 月 26 日创建
推荐阅读
- java - 如何在多个类中使用父类对象的结果
- reactjs - 强制 csv 使用十进制格式
- javascript - 比较数组中的 JavaScript 对象时出现问题
- c - 代码 139(被信号 11:SIGSEGV 中断),我是 C 的初学者
- typescript - 打字稿类型的可选链接
- reverse-proxy - 从 Thorntail 到 Quarkus 的代理地址转发属性
- python - 使用 Python3.9 和 BeautifulSoup 4 进行 JSONDecodeError 网页抓取
- c++ - 在 C++ 中使用圆括号和方括号创建动态数组的区别
- java - maven-dependency-plugin 未显示测试范围的依赖项
- javascript - 如果类组件中的条件发生反应,如何在内部调用函数?