首页 > 解决方案 > 查找一组事件的补集(时间间隔)

问题描述

给定在指定观察期内发生的一组事件(时间间隔),我试图找到没有发生事件的时间间隔。我们可以假设事件之间没有重叠。有没有比下面的方法更有效/更智能的方法?

测试 df

events <- data.frame(eventStartTime = c("2019-01-20 18:03:00", "2019-01-20 18:10:00", "2019-01-20 18:50:00"), 
                    eventEndTime = c("2019-01-20 18:05:00", "2019-01-20 18:20:00", "2019-01-20 18:55:00"))
events <- as.data.frame(lapply(events[,c('eventStartTime', 'eventEndTime')], as.POSIXct, format = "%Y-%m-%d %H:%M:%S", tz = "CET"))

预期输出

complementEvents <- data.frame(complementStartTime = c("2019-01-20 18:00:00", "2019-01-20 18:05:00", "2019-01-20 18:20:00", "2019-01-20 18:55:00"), 
                 complementEndTime = c("2019-01-20 18:03:00", "2019-01-20 18:10:00", "2019-01-20 18:50:00", "2019-01-20 19:00:00"))
complementEvents <- as.data.frame(lapply(complementEvents[,c('complementStartTime', 'complementEndTime')], as.POSIXct, format = "%Y-%m-%d %H:%M:%S", tz = "CET"))

我要实现的目标的可视化:

library(ggplot2)
options(stringsAsFactors = FALSE)

events$type <- rep("event", nrow(events))
complementEvents$type <- rep("complement event", nrow(complementEvents))
names(complementEvents) <- names(events)

observationStartTime <- as.POSIXct("2019-01-20 18:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "CET")
observationEndTime <- as.POSIXct("2019-01-20 19:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "CET")

ggplot(data = rbind(events, complementEvents)) +
geom_rect(mapping=aes(xmin=eventStartTime, xmax=eventEndTime, ymin=0,
                      ymax=0.5, fill = type), alpha = 0.4)+
scale_y_continuous(limits = c(0,0.5))+
scale_x_datetime(date_breaks = "10 min", 
                 date_labels = "%H:%M", 
                 limits = c(observationStartTime, observationEndTime))+
scale_fill_manual(values=c("event"="#1a75ce", "complement event"="#fdbb2f"))+
theme_minimal()+
theme(panel.grid.major.y =  element_blank(),
      panel.grid.minor.y =  element_blank(),
      axis.title = element_blank(),
      axis.text.y = element_blank(),
      text = element_text(size = 12),
      legend.position = "top")

我写了以下函数:

findComplementIntervals <- function(data, obsStartTime, obsEndTime){

# find time intervals of complement events given an observation time interval

complementEvents <- data.frame()

temp <- data.frame(complementStartTime = data$eventEndTime, complementEndTime = lead(data$eventStartTime))
if (data$eventStartTime[1] == obsStartTime & data$eventEndTime[nrow(data)] == obsEndTime){

    complementEvents <- temp[-nrow(temp),]

}else if (data$eventStartTime[1] == obsStartTime & data$eventEndTime[nrow(data)] < obsEndTime){

    temp$complementEndTime[nrow(temp)] <- obsEndTime
    complementEvents <- temp

}else if (data$eventStartTime[1] > obsStartTime & data$eventEndTime[nrow(data)] == obsEndTime){

    complementEvents <- temp[-nrow(temp),]
    complementEvents[nrow(complementEvents) + 1,] <- rep(NA,2)
    complementEvents$complementStartTime[nrow(complementEvents)] <- obsStartTime
    complementEvents$complementEndTime[nrow(complementEvents)] <- data$eventStartTime[1] 

}else{

    temp$complementEndTime[nrow(temp)] <- obsEndTime
    complementEvents <- temp
    complementEvents[nrow(complementEvents) + 1,] <- rep(NA,2)
    complementEvents$complementStartTime[nrow(complementEvents)] <- obsStartTime
    complementEvents$complementEndTime[nrow(complementEvents)] <- data$eventStartTime[1]

}

complementEvents <- complementEvents[order(complementEvents$complementStartTime),]
return(complementEvents)
}

是否有另一种方法可以找到更有效/更优雅的补充事件/时间间隔?

标签: r

解决方案


一种data.table方法

#set window to analyse
startTime = as.POSIXct( "2019-01-20 18:00:00", 
                        format = "%Y-%m-%d %H:%M:%S", tz = "CET")
endTime   = as.POSIXct( "2019-01-20 19:00:00", 
                        format = "%Y-%m-%d %H:%M:%S", tz = "CET")

library( data.table )
#create data.table with all minutes bewteen start - end
dt.mins <- data.table( minute = seq(startTime, endTime - 60, by = "1 mins"),
                       minute2 = seq(startTime +60 , endTime, by = "1 mins") )
#perform by-reference non-equi join
dt.mins[ setDT( events ), event := 1, on = c( "minute >= eventStartTime", 
                                              "minute < eventEndTime" ) ]
#set eventumber = 0 for minutes that fall outside events
dt.mins[ is.na( event ), event := 0 ]
#create group-numbers to summarise on later, using rleid()
dt.mins[, group := rleid( event ) ]
#summarise by group, on all rows (=minutes) that fall outside events
dt.mins[ event == 0, ][, list( complementStartTime = min( minute ),
                               complementEndTime = max( minute2 ) ),
                       by = .(group)][, group := NULL][]

输出

#    complementStartTime   complementEndTime
# 1: 2019-01-20 18:00:00 2019-01-20 18:03:00
# 2: 2019-01-20 18:05:00 2019-01-20 18:10:00
# 3: 2019-01-20 18:20:00 2019-01-20 18:50:00
# 4: 2019-01-20 18:55:00 2019-01-20 19:00:00

推荐阅读