r - 如何检查变量元组是否导致第二个元组列表中的匹配?
问题描述
我有一个数据框,其中包含在某些位置和时间d1
的观察结果。l
t
> head(d1, 3)
id l p t X
1 1 258 2016 2016-01-05 -1.158644
2 5 261 2016 2016-01-14 1.604873
3 2 261 2016 2016-01-20 -1.102002
在另一个数据框中p2
,我有位置的时间间隔t1:t2
,如果匹配位置l
的元组和时间间隔的元组,我想逐行检查。d1
p2
> head(p2, 3)
l p t1 t2
1 261 2016 2016-01-11 2016-01-25
2 261 2017 2017-02-27 2017-03-13
3 261 2017 2017-03-01 2017-03-15
在正的情况下,虚拟变量d1$match
的值应该是 1,在负的情况下是 0:
# [1] 0 1 1 ...
到目前为止,我的尝试是,首先,将两个数据帧折叠成字符串l
并p
进行比较,其次,检查是否t
位于t1:t2
.
但是,我想出的代码有点笨拙,如果句点不重叠,它只会或多或少地起作用,如p1
. "Date"
此外,由于课程似乎存在问题,还会发出警告。
> p1
l p t1 t2
1 261 2016 2016-01-11 2016-01-25
2 261 2017 2017-02-27 2017-03-13
4 258 2018 2018-01-09 2018-01-23
p <- p1
p.strg <- sapply(1:nrow(p), function(x) {
do.call(paste, c(p[x, c("l", "p")], sep = "|"))
})
sapply(1:nrow(d1), function(x) {
strg <- do.call(paste, c(d1[x, c("l", "p")], sep = "|"))
t.d <- d1[x, "t"]
t.p <- p[which(p.strg %in% strg), c("t1", "t2")]
return(as.integer(any(p.strg %in% strg) & t.d >= t.p[1] &
t.d <= t.p[2]))
})
# [1] 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0
# There were 30 warnings (use warnings() to see them)
# warnings()
# Warning messages:
# 1: In FUN(X[[i]], ...) :
# Incompatible methods ("Ops.Date", "Ops.data.frame") for ">="
# ...
如果期间确实像 中那样重叠p2
,
p <- p2
p.strg <- sapply(1:nrow(p), function(x) {
do.call(paste, c(p[x, c("l", "p")], sep = "|"))
})
sapply(1:nrow(d1), function(x) {
strg <- do.call(paste, c(d1[x, c("l", "p")], sep = "|"))
t.d <- d1[x, "t"]
t.p <- p[which(p.strg %in% strg), c("t1", "t2")]
return(as.integer(any(p.strg %in% strg) & t.d >= t.p[1] &
t.d <= t.p[2]))
})
它根本不起作用:
Error in FUN(X[[i]], ...) :
(list) object cannot be coerced to type 'double'
In addition: There were 13 warnings (use warnings() to see them)
我想我有点迷路了。在基础 R中解决这个问题的更好方法是什么?
注意:我的原始数据更广泛(d1:20000 x 11,p2:1700 x 8),所以我需要一个有效的解决方案。
数据:
d1 <- structure(list(id = c(1L, 5L, 2L, 3L, 1L, 3L, 4L, 5L, 2L, 3L,
5L, 1L, 2L, 4L, 4L), l = c(258, 261, 261, 260, 258, 260, 261,
261, 259, 260, 261, 258, 259, 261, 261), p = c(2016, 2016, 2016,
2016, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2018, 2018, 2018,
2018), t = structure(c(16805, 16814, 16820, 16924, 17193, 17211,
17227, 17229, 17348, 17481, 17517, 17543, 17554, 17787, 17887
), class = "Date"), X = c(-1.15864442153663, 1.60487335898257,
-1.10200153102672, -0.823719007033067, 1.20944271845298, 0.790388149166713,
-1.0996495357495, -0.421449225963478, -0.243567712934607, -0.337415580767635,
-1.64590022554026, 2.11206142393207, -0.950235138478342, -2.08164602167738,
-1.88576409729638), match = c(0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 0L, 0L, 0L)), row.names = c(NA, -15L), class = "data.frame")
p1 <- structure(list(l = c(261, 261, 258), p = c(2016, 2017, 2018),
t1 = structure(c(16811, 17224, 17540), class = "Date"), t2 = structure(c(16825,
17238, 17554), class = "Date")), row.names = c(1L, 2L, 4L
), class = "data.frame")
p2 <- structure(list(l = c(261, 261, 261, 258, 259, 261), p = c(2016,
2017, 2017, 2018, 2018, 2018), t1 = structure(c(16811, 17224,
17226, 17540, 17551, 17884), class = "Date"), t2 = structure(c(16825,
17238, 17240, 17554, 17565, 17898), class = "Date")), row.names = c(NA,
-6L), class = "data.frame")
解决方案
这是一个使用循环的原始解决方案:
d1[["match"]] <- 0L
for (i in seq_len(nrow(d1))) {
p2rows <- which(p2[["l"]] == d1[["l"]][i])
for (r in p2rows) { # If no location match, there will be nothing to loop over
if (d1[["t"]][i] >= with(p2[r,], t1) && d1[["t"]][i] <= with(p2[r,], t2)) {
d1[["match"]][i] <- 1L
break # Enough to find one match, we break out of the inner loop
}
}
}
id l p t X match
1 1 258 2016 2016-01-05 -1.1586444 0
2 5 261 2016 2016-01-14 1.6048734 1
3 2 261 2016 2016-01-20 -1.1020015 1
4 3 260 2016 2016-05-03 -0.8237190 0
5 1 258 2017 2017-01-27 1.2094427 0
6 3 260 2017 2017-02-14 0.7903881 0
7 4 261 2017 2017-03-02 -1.0996495 1
8 5 261 2017 2017-03-04 -0.4214492 1
9 2 259 2017 2017-07-01 -0.2435677 0
10 3 260 2017 2017-11-11 -0.3374156 0
11 5 261 2017 2017-12-17 -1.6459002 0
12 1 258 2018 2018-01-12 2.1120614 1
13 2 259 2018 2018-01-23 -0.9502351 1
14 4 261 2018 2018-09-13 -2.0816460 0
15 4 261 2018 2018-12-22 -1.8857641 1
编辑:稍微快一点的版本:
for (i in seq_len(nrow(d1))) {
p2rows <- which(p2[["l"]] == d1[["l"]][i])
if (any(d1[["t"]][i] >= with(p2[p2rows,], t1) & d1[["t"]][i] <= with(p2[p2rows,], t2))) {
d1[["match"]][i] <- 1L
}
}
编辑2:再次应该稍微快一点:
library(data.table)
sapply(
seq_len(nrow(d1)),
function(i) {
p2rows <- which(p2[["l"]] == d1[["l"]][i])
as.integer(any(between(d1[["t"]][i], p2[p2rows, "t1"], p2[p2rows, "t2"])))
}
)
推荐阅读
- jquery - 使用 jQuery 验证验证和访问多个页面上的表单
- javascript - 如何获取频道中每个语音用户的每一分钟?不和谐.js v13
- c# - 如何防止 Serilog 自动将每一个小步骤记录为信息?
- python - 如何在 3d 图形上绘制线性回归线,我可以让两者分别工作,但不能将它们组合成一个图
- c# - IhostedService 异常
- python - 为什么使用 **kwargs 时会出现键 'kwargs'?
- django - 同一应用程序中的 2 个 urls 文件可以在 Django 中具有相同的应用程序名称吗?
- python - 如何以正确的顺序绘制分组条形图
- azure - 为什么需要 On Premise Data Gateway 将 Azure Analysis Services 与具有专用终结点的 Azure 数据库连接起来?
- c# - 我有一个有效的代码,但我怎样才能让它更优雅