r - 查找发动机的连续运行天数
问题描述
我想计算一台发动机的连续运行天数:实际上有多台发动机,但我把最小可复现示例(MRE)限制在一台发动机上,希望你能给出一个可以很容易推广到多台发动机的解决方案。为此,我在不同时间对各台发动机做了一些传感器测量。这些测量通常非常频繁,但传感器有时会失效,因此采样率可能不是恒定的。不过,如果两次连续测量之间的时间差大于 1 天,就意味着发动机已经停机,运行天数必须从 0 重新开始计数。例如:
library(lubridate)
library(dplyr)
library(tibble)
# Fixed seed so that the simulated measurements and the simulated stops are
# reproducible.
set.seed(3)

# ---- Simulated measurements for a single engine ----
a_day <- 6        # nominal number of samples per day (one every 4 hours)
n <- a_day * 10   # total number of samples before any are removed
engine <- factor(rep("engine_A", each = n))
end_date <- as_datetime("2018-09-13 19:26:29")
start_date <- end_date - n * hours(4)
# n equally spaced timestamps from start_date to end_date (both included)
date_time <- seq(start_date, end_date, length.out = n)
x <- runif(n)
y <- rnorm(n)
my_df <- data.frame(engine, date_time, x, y)

# ---- Simulated stops (rows to be removed) ----
# short stops don't restart the running days count
short_stops <- sample(seq_len(n), 5)
# long stops, however, do
# medium stop: a_day consecutive positions around one random position
medium_stop <- sample(seq_len(n), 1)
medium_stop <- rep(medium_stop, each = a_day) + (-3:2)
long_stop <- seq(30, 40)

# merge stop indices
index <- sort(unique(c(short_stops, medium_stop, long_stop)))
# The window around the medium stop can run past either end of the data.
# Positions below 1 would turn into positive subscripts once negated, and R
# refuses to mix positive and negative subscripts, so keep only valid rows.
index <- index[index >= 1 & index <= n]

# remove the rows corresponding to the stops
my_df <- my_df[-index, ]
由于在这个例子中停机位置是我自己定义的,我可以直接按如下方式计算 `run_days`:
# Reference answer for the example above. The stop positions are known here,
# so every row is measured against the first sample of its own run:
#   rows 1-13 -> start_date, rows 14-19 -> row 14, rows 20 and later -> row 20.
# The elapsed time is expressed in days and rounded to the nearest integer.
my_df <- my_df %>%
  rowid_to_column() %>%
  mutate(
    elapsed_days = case_when(
      rowid >= 20 ~ (date_time - date_time[20]) / ddays(1),
      rowid >= 14 ~ (date_time - date_time[14]) / ddays(1),
      TRUE ~ (date_time - start_date) / ddays(1)
    ),
    run_days = as.integer(round(elapsed_days))
  ) %>%
  select(-elapsed_days)
即:
> my_df
rowid engine date_time x y run_days
1 1 engine_A 2018-09-03 19:26:29 0.16804153 0.900624729 0
2 2 engine_A 2018-09-03 23:30:33 0.80751640 0.851770447 0
3 3 engine_A 2018-09-04 03:34:37 0.38494235 0.727715174 0
4 4 engine_A 2018-09-04 07:38:41 0.32773432 0.736502146 1
5 5 engine_A 2018-09-04 11:42:45 0.60210067 -0.352129617 1
6 6 engine_A 2018-09-04 19:50:53 0.12463344 1.300357989 1
7 7 engine_A 2018-09-04 23:54:57 0.29460092 0.038252014 1
8 8 engine_A 2018-09-05 03:59:01 0.57760992 -0.979283770 1
9 9 engine_A 2018-09-05 12:07:09 0.51201590 0.786506872 2
10 10 engine_A 2018-09-05 20:15:17 0.53403535 1.698884846 2
11 11 engine_A 2018-09-06 00:19:21 0.55724944 -0.794593709 2
12 12 engine_A 2018-09-06 08:27:30 0.82970869 -2.265401074 3
13 13 engine_A 2018-09-06 12:31:34 0.11144915 -0.162205279 3
14 14 engine_A 2018-09-07 17:00:02 0.09338193 -1.737263711 0
15 15 engine_A 2018-09-07 21:04:06 0.23688501 -1.411425136 0
16 16 engine_A 2018-09-08 01:08:10 0.79114741 -0.453551227 0
17 17 engine_A 2018-09-08 05:12:14 0.59973157 -1.035491275 1
18 18 engine_A 2018-09-08 09:16:18 0.91014771 1.362142893 1
19 19 engine_A 2018-09-08 13:20:22 0.56042455 0.917456737 1
20 20 engine_A 2018-09-10 14:09:11 0.28146879 -0.031325502 0
21 21 engine_A 2018-09-10 18:13:15 0.78628120 0.467097310 0
22 22 engine_A 2018-09-10 22:17:19 0.17301935 1.024197674 0
23 23 engine_A 2018-09-11 02:21:23 0.57074752 0.267358452 1
24 24 engine_A 2018-09-11 06:25:27 0.41928296 0.231826103 1
25 25 engine_A 2018-09-11 10:29:32 0.26762217 0.747592465 1
26 26 engine_A 2018-09-11 14:33:36 0.04780944 1.217068511 1
27 27 engine_A 2018-09-11 18:37:40 0.10349305 0.383358345 1
28 28 engine_A 2018-09-11 22:41:44 0.31403146 -0.988052822 1
29 29 engine_A 2018-09-12 02:45:48 0.80064106 -0.156852910 2
30 30 engine_A 2018-09-12 06:49:52 0.22932470 1.735535216 2
31 31 engine_A 2018-09-12 10:53:56 0.21299844 -0.352298306 2
32 32 engine_A 2018-09-12 14:58:00 0.87710091 0.688640044 2
33 33 engine_A 2018-09-12 19:02:04 0.99322196 1.224406096 2
34 34 engine_A 2018-09-12 23:06:08 0.84424702 0.794296303 2
35 35 engine_A 2018-09-13 03:10:12 0.91043655 -0.006402398 3
36 36 engine_A 2018-09-13 07:14:16 0.47126973 0.219150635 3
37 37 engine_A 2018-09-13 11:18:20 0.22441841 -0.886463751 3
38 38 engine_A 2018-09-13 15:22:24 0.12781466 0.439760291 3
39 39 engine_A 2018-09-13 19:26:29 0.27968351 -0.886389751 3
一般情况下,我手头只有数据框 `my_df`,需要自动生成 `run_days` 这一列,而不是靠人工检查来确定停机位置。我怎样才能做到这一点?
解决方案
这是另一个答案:它不使用循环,而是使用更高效的 tidyverse 函数:
library(tidyverse)
# Largest gap, in days, allowed between two consecutive measurements of the
# same engine; a larger gap means the engine stopped and the count restarts.
offThreshold <- 1

# Step 1: for every engine, measure the gap to its previous sample.
#   off      - TRUE when the gap exceeds offThreshold, NA on an engine's
#              first sample (there is no previous sample to compare with)
#   timediff - days elapsed since the previous sample, or 0 on the first
#              sample of a run
# Grouping by engine keeps lag() from comparing the first sample of one
# engine with the last sample of another; sorting guarantees that lag()
# really returns the chronologically previous sample.
df <- my_df %>%
  arrange(engine, date_time) %>%
  group_by(engine) %>%
  mutate(
    gap_days = as.numeric(difftime(date_time, lag(date_time), units = "days")),
    off = gap_days > offThreshold,
    timediff = if_else(is.na(off) | off, 0, gap_days)
  ) %>%
  ungroup() %>%
  select(-gap_days)

# Step 2: number the runs of each engine and accumulate the elapsed time
# inside each run. A run starts on the engine's first sample (off is NA) or
# right after a stop (off is TRUE), so a cumulative sum of that flag gives the
# run number directly, without a join on the timestamps.
dat <- df %>%
  group_by(engine) %>%
  mutate(runNo = cumsum(is.na(off) | off)) %>%
  group_by(engine, runNo) %>%   # one group per engine/run combination
  mutate(run_days = round(cumsum(timediff))) %>%
  ungroup() %>%
  select(-off, -timediff, -runNo)

head(dat, 15)
最终结果如下(前 15 行):
# A tibble: 15 x 5
engine date_time x y run_days
<fct> <dttm> <dbl> <dbl> <dbl>
1 engine_A 2018-09-03 19:26:29 0.168 0.901 0
2 engine_A 2018-09-03 23:30:33 0.808 0.852 0
3 engine_A 2018-09-04 03:34:37 0.385 0.728 0
4 engine_A 2018-09-04 07:38:41 0.328 0.737 1
5 engine_A 2018-09-04 11:42:45 0.602 -0.352 1
6 engine_A 2018-09-04 19:50:53 0.125 1.30 1
7 engine_A 2018-09-04 23:54:57 0.295 0.0383 1
8 engine_A 2018-09-05 03:59:01 0.578 -0.979 1
9 engine_A 2018-09-05 12:07:09 0.512 0.787 2
10 engine_A 2018-09-05 20:15:17 0.534 1.70 2
11 engine_A 2018-09-06 00:19:21 0.557 -0.795 2
12 engine_A 2018-09-06 08:27:30 0.830 -2.27 3
13 engine_A 2018-09-06 12:31:34 0.111 -0.162 3
14 engine_A 2018-09-07 17:00:02 0.0934 -1.74 0
15 engine_A 2018-09-07 21:04:06 0.237 -1.41 0