首页 > 解决方案 > 如何为R中的每个因素计算几个间隔内的事件?

问题描述

我正在尝试计算每个间隔内发生的事件数以及我的每个因素(mystations)。

下面是一个MWE:

library(lubridate)

myintervals <- c(dmy_hms(
"01/01/2000 08:00:00",
"25/02/2000 09:00:00",
"01/03/2000 10:00:00",
"30/04/2000 11:00:00",
"01/05/2000 12:00:00",
"30/06/2000 13:00:00",
"01/07/2000 14:00:00",
"30/08/2000 15:00:00",
"01/09/2000 16:00:00",
"30/10/2000 17:00:00"))

mystations <- c("A","B","C","A","B","C","A","B","C","D")

mydata <- data.frame(myintervals,mystations)

           myintervals mystations
 |1  2000-01-01 08:00:00          A
 |2  2000-02-25 09:00:00          B
 |3  2000-03-01 10:00:00          C
 |4  2000-04-30 11:00:00          A
 |5  2000-05-01 12:00:00          B
 |6  2000-06-30 13:00:00          C
 |7  2000-07-01 14:00:00          A
 |8  2000-08-30 15:00:00          B
 |9  2000-09-01 16:00:00          C
 |10 2000-10-30 17:00:00          D

在这里,我正在创建检测

date.time <- c(dmy_hms(
"31/12/1999 08:00:00",
"24/02/2000 09:00:00",
"25/02/2000 08:00:00",
"26/02/2000 10:00:00",
"27/02/2000 11:00:00",
"01/03/2000 10:00:00",
"10/03/2000 22:00:00",
"20/03/2000 23:00:00",
"01/04/2000 10:00:00",
"20/04/2000 20:00:00",
"25/04/2000 08:00:00",
"30/04/2000 10:00:00",
"01/05/2000 12:00:00",
"10/05/2000 20:00:00",
"20/05/2000 08:00:00",
"30/06/2000 13:00:00",
"10/07/2000 10:00:00",
"20/07/2000 20:00:00",
"30/08/2000 15:00:00",
"01/09/2000 16:00:00"))

mydetections <- data.frame(date.time=date.time,mystations=mystations)

             date.time mystations
 |1  1999-12-31 08:00:00          A
 |2  2000-02-24 09:00:00          B
 |3  2000-02-25 08:00:00          C
 |4  2000-02-26 10:00:00          A
 |5  2000-02-27 11:00:00          B
 |6  2000-03-01 10:00:00          C
 |7  2000-03-10 22:00:00          A
 |8  2000-03-20 23:00:00          B
 |9  2000-04-01 10:00:00          C
 |10 2000-04-20 20:00:00          D
 |11 2000-04-25 08:00:00          A
 |12 2000-04-30 10:00:00          B
 |13 2000-05-01 12:00:00          C
 |14 2000-05-10 20:00:00          A
 |15 2000-05-20 08:00:00          B
 |16 2000-06-30 13:00:00          C
 |17 2000-07-10 10:00:00          A
 |18 2000-07-20 20:00:00          B
 |19 2000-08-30 15:00:00          C
 |20 2000-09-01 16:00:00          D

每个间隔的起源在这里:

myorigins <- data.frame(myintervals=c(
dmy_hms("01/01/1970 00:00:00","01/04/1970 00:00:00","01/08/1970 00:00:00","01/12/1970 00:00:00")),mystations=c(unique(mydata$mystations)))

预期的输出是这样的:

                                        myintervals mystation value
1  1970-01-01 00:00:00 UTC--2000-01-01 08:00:00 UTC         A     1
2  2000-01-01 08:00:00 UTC--2000-04-30 11:00:00 UTC         A     3
3  2000-04-30 11:00:00 UTC--2000-07-01 14:00:00 UTC         A     1
4  1970-04-01 00:00:00 UTC--2000-02-25 09:00:00 UTC         B     1
5  2000-02-25 09:00:00 UTC--2000-05-01 12:00:00 UTC         B     3
6  2000-05-01 12:00:00 UTC--2000-08-30 15:00:00 UTC         B     2
7  1970-08-01 00:00:00 UTC--2000-03-01 10:00:00 UTC         C     2
8  2000-03-01 10:00:00 UTC--2000-06-30 13:00:00 UTC         C     3
9  2000-06-30 13:00:00 UTC--2000-09-01 16:00:00 UTC         C     1
10 1970-12-01 00:00:00 UTC--2000-10-30 17:00:00 UTC         D     1

到目前为止,我能够实现的是:

#line by line
mydata <- add_row(mydata,myorigins)
mydata <- arrange(mydata,mystations,myintervals)
DF <- group_split(mydata,mystations)
Y <- lapply(seq_along(DF), function(x) as.data.frame(DF[[x]]))
names(Y) <- c(unique(mydata$mystations))
list2env(Y, envir = .GlobalEnv)

#splitting the detections
DFD <- group_split(mydetections,mystations)
Z <- lapply(seq_along(DFD), function(x) as.data.frame(DFD[[x]]))
names(Z) <- c(paste(unique(mydata$mystations),"det",sep=""))
list2env(Z, envir = .GlobalEnv)

我相信现在是时候“只”为每个数据帧构建间隔,如下所示:

Aint <- int_diff(A$myintervals)

并“检查”哪个检测属于哪个时间间隔:

myresA <- Adet$date.time%within%Aint

显然,我想避免“手动”构建每个 df As 的间隔。与往常一样,对于获得所需输出的任何帮助或提示,我将不胜感激。对于帖子中最初的混乱,我深表歉意。

标签: rloopsdatetimeintervals

解决方案


Here are a couple of options to consider - hope this may be helpful.

Using tidyverse you can add your myorigins to mydata, then after sorting with arrange make time intervals (start-end).

You can use fuzzy_left_join to add the events table, matching on mystations and where the date.time falls between the interval start and end.

Then, after grouping, you can count the number of rows. You will get something close to your result, depending on how you want to handle edge cases.

library(tidyverse)
library(fuzzyjoin)
library(lubridate)

bind_rows(mydata, myorigins) %>%
  arrange(myintervals) %>%
  group_by(mystations) %>%
  transmute(start = myintervals, end = lead(myintervals)) %>%
  filter(!is.na(end)) %>%
  fuzzy_left_join(
    mydetections,
    by = c("mystations", "start" = "date.time", "end" = "date.time"),
    match_fun = c(`==`, `<`, `>=`)
  ) %>%
    group_by(start, end, mystations.x) %>%
    summarise(count = n()) %>%
    arrange(mystations.x)

Output

   start               end                 mystations.x count
   <dttm>              <dttm>              <chr>        <int>
 1 1970-01-01 00:00:00 2000-01-01 08:00:00 A                1
 2 2000-01-01 08:00:00 2000-04-30 11:00:00 A                3
 3 2000-04-30 11:00:00 2000-07-01 14:00:00 A                1
 4 1970-04-01 00:00:00 2000-02-25 09:00:00 B                1
 5 2000-02-25 09:00:00 2000-05-01 12:00:00 B                3
 6 2000-05-01 12:00:00 2000-08-30 15:00:00 B                2
 7 1970-08-01 00:00:00 2000-03-01 10:00:00 C                2
 8 2000-03-01 10:00:00 2000-06-30 13:00:00 C                3
 9 2000-06-30 13:00:00 2000-09-01 16:00:00 C                1
10 1970-12-01 00:00:00 2000-10-30 17:00:00 D                2

An alternative to consider is using data.table which would be faster. One function that may be helpful here is foverlaps to find overlap between the event dates and date ranges.

library(data.table)

dt <- rbind(myorigins, mydata)
setDT(dt)

dt[, c("start", "end") := list(myintervals, lead(myintervals)), by = mystations]
dt <- na.omit(dt, "end")

setDT(mydetections)

mydetections[,date.time.copy := copy(date.time)]
setkey(mydetections, mystations, date.time, date.time.copy)

dt_ovlp <- foverlaps(dt, 
                     mydetections, 
                     by.x = c("mystations", "start", "end"),
                     by.y = c("mystations", "date.time", "date.time.copy"))

dt_ovlp[ , .(value = .N), by = c("mystations", "start", "end")][order(mystations, start)]

推荐阅读