r - 在R中的汇总预测表中按组加入残差
问题描述
可重现的例子
df=structure(list(group = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L), year = c(1973L, 1974L, 1975L, 1976L, 1977L, 1978L,
1973L, 1974L, 1975L, 1976L, 1977L, 1978L), Jan = c(9007L, 7750L,
8162L, 7717L, 7792L, 7836L, 9007L, 7750L, 8162L, 7717L, 7792L,
7836L), Feb = c(8106L, 6981L, 7306L, 7461L, 6957L, 6892L, 8106L,
6981L, 7306L, 7461L, 6957L, 6892L), Mar = c(8928L, 8038L, 8124L,
7767L, 7726L, 7791L, 8928L, 8038L, 8124L, 7767L, 7726L, 7791L
), Apr = c(9137L, 8422L, 7870L, 7925L, 8106L, 8192L, 9137L, 8422L,
7870L, 7925L, 8106L, 8192L), May = c(10017L, 8714L, 9387L, 8623L,
8890L, 9115L, 10017L, 8714L, 9387L, 8623L, 8890L, 9115L), Jun = c(10826L,
9512L, 9556L, 8945L, 9299L, 9434L, 10826L, 9512L, 9556L, 8945L,
9299L, 9434L), Jul = c(11317L, 10120L, 10093L, 10078L, 10625L,
10484L, 11317L, 10120L, 10093L, 10078L, 10625L, 10484L), Aug = c(10744L,
9823L, 9620L, 9179L, 9302L, 9827L, 10744L, 9823L, 9620L, 9179L,
9302L, 9827L), Sep = c(9713L, 8743L, 8285L, 8037L, 8314L, 9110L,
9713L, 8743L, 8285L, 8037L, 8314L, 9110L), Oct = c(9938L, 9129L,
8466L, 8488L, 8850L, 9070L, 9938L, 9129L, 8466L, 8488L, 8850L,
9070L), Nov = c(9161L, 8710L, 8160L, 7874L, 8265L, 8633L, 9161L,
8710L, 8160L, 7874L, 8265L, 8633L), Dec = c(8927L, 8680L, 8034L,
8647L, 8796L, 9240L, 8927L, 8680L, 8034L, 8647L, 8796L, 9240L
)), .Names = c("group", "year", "Jan", "Feb", "Mar", "Apr", "May",
"Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), class = "data.frame", row.names = c(NA,
-12L))
按组执行 Forecat
library(forecast)
ld <- split(df[, -1], df$group)
ld <- lapply(ld, function(x) {ts(c(t(x[,-1])), start = min(x[,1]), frequency = 12)})
lts <- lapply(ld, ets, model = "ZZZ")
所以结果
$`1`
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
Jan 1979 8397.497 8022.399 8772.595 7823.834 8971.160
Feb 1979 7599.221 7162.825 8035.616 6931.812 8266.630
Mar 1979 8396.595 7906.510 8886.679 7647.075 9146.115
Apr 1979 8646.510 8108.063 9184.957 7823.026 9469.994
从 1979 年开始,它是预测值,我想得到 1973-1978 的残差结果。(初始值)
res <- lapply(lts, residuals)
和结果
$`1`
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
1973 497.69233 99.50607 64.44947 -15.20925 77.85009 390.89045 -277.67369 26.92614 72.42590 -85.69894 -338.10035
等等
- 问题:对于 1979 年及以上,我们看到预测值,但对于 1973-1978 年的列点预测,我们看到残差。理想情况下,当然,得到的不是那么多残差,而是原始值和预测值。所以我不知道如何将初始数据 1973-1978 加入汇总表原始值这样
df[df$year == 1973,]
的东西,但是全年如何......然后从原始值中减去残差并得到预测值(也许我使任务复杂化了很多,但是否则我不知道如何获得所需的输出)
colnames point forecast
,lo80
并且hi80
不需要更改,我会记住,对于初始值,它们意味着残差、原始和预测。
是否可以使用 dplyr 或 data.table 解决方案来做到这一点?
# Tidy-up the splits
ld <- lapply(ld, function(x) {
x %>%
gather(key, value, -year) %>%
unite(date, year, key, sep = "-") %>%
mutate(date = paste0(date, "-01")) %>%
mutate(date = as.Date(date, format = "%Y-%b-%d"))
})
结果
$`1`
date value
1 <NA> 9007
2 <NA> 7750
3 <NA> 8162
4 <NA> 7717
5 <NA> 7792
6 <NA> 7836
7 <NA> 8106
8 <NA> 6981
9 <NA> 7306
10 <NA> 7461
11 <NA> 6957
12 <NA> 6892
ld=dput()
ld <- lapply(ld, function(x) {
yr <- lubridate::year(min(x$date))
mth <- lubridate::month(min(x$date))
timetk::tk_ts(data = x, select = value, frequency = 12,
start = c(yr, mth))
})
错误
Error in x$date : $ operator is invalid for atomic vectors
编辑3
> lts_all <- lapply(names(lts), function(x, lts) {
+ output_fit <- lts[[x]][["res_fit_tbl"]] %>%
+ mutate(group = x)
+ output_fcst <- lts[[x]][["res_fcst_tbl"]] %>%
+ mutate(group = x)
+
+ return(list(output_fit=output_fit, output_fcst=output_fcst))
+ }, lts)
> lts_all
[[1]]
[[1]]$output_fit
# A tibble: 72 x 6
date value residuals CI95_upper CI95_lower group
<date> <dbl> <dbl> <dbl> <dbl> <chr>
1 1973-01-01 8509 498 9083 7936 value
2 1973-02-01 8006 99.5 8580 7433 value
3 1973-03-01 8864 64.4 9437 8290 value
4 1973-04-01 9152 - 15.2 9726 8579 value
5 1973-05-01 9939 77.9 10513 9365 value
6 1973-06-01 10435 391 11009 9861 value
7 1973-07-01 11595 -278 12168 11021 value
8 1973-08-01 10717 26.9 11291 10143 value
9 1973-09-01 9641 72.4 10214 9067 value
10 1973-10-01 10024 - 85.7 10597 9450 value
# ... with 62 more rows
解决方案
这是一个完整的解决方案df
,从可重现的示例开始:
# load libraries
load_pkgs <- c("forecast", "zoo", "timetk", "tidyverse")
sapply(load_pkgs, function(x) suppressPackageStartupMessages(library(x, character.only = T)))
第 1 步:预处理
# perform split by group
ld <- split(df[, -1], df$group)
# Tidy-up the splits
ld <- lapply(ld, function(x) {
x %>%
gather(key, value, -year) %>%
unite(date, year, key, sep = "-") %>%
mutate(date = paste0(date, "-01")) %>%
mutate(date = as.Date(date, format = "%Y-%b-%d"))
})
dput
第一个ts:
structure(list(date = structure(c(1096, 1461, 1826, 2191, 2557,
2922, 1127, 1492, 1857, 2222, 2588, 2953, 1155, 1520, 1885, 2251,
2616, 2981, 1186, 1551, 1916, 2282, 2647, 3012, 1216, 1581, 1946,
2312, 2677, 3042, 1247, 1612, 1977, 2343, 2708, 3073, 1277, 1642,
2007, 2373, 2738, 3103, 1308, 1673, 2038, 2404, 2769, 3134, 1339,
1704, 2069, 2435, 2800, 3165, 1369, 1734, 2099, 2465, 2830, 3195,
1400, 1765, 2130, 2496, 2861, 3226, 1430, 1795, 2160, 2526, 2891,
3256), class = "Date"), value = c(9007L, 7750L, 8162L, 7717L,
7792L, 7836L, 8106L, 6981L, 7306L, 7461L, 6957L, 6892L, 8928L,
8038L, 8124L, 7767L, 7726L, 7791L, 9137L, 8422L, 7870L, 7925L,
8106L, 8192L, 10017L, 8714L, 9387L, 8623L, 8890L, 9115L, 10826L,
9512L, 9556L, 8945L, 9299L, 9434L, 11317L, 10120L, 10093L, 10078L,
10625L, 10484L, 10744L, 9823L, 9620L, 9179L, 9302L, 9827L, 9713L,
8743L, 8285L, 8037L, 8314L, 9110L, 9938L, 9129L, 8466L, 8488L,
8850L, 9070L, 9161L, 8710L, 8160L, 7874L, 8265L, 8633L, 8927L,
8680L, 8034L, 8647L, 8796L, 9240L)), class = "data.frame", row.names = c(NA,
-72L))
然后
# Transform time series to ts objects
ld <- lapply(ld, function(x) {
yr <- lubridate::year(min(x$date))
mth <- lubridate::month(min(x$date))
timetk::tk_ts(data = x, select = value, frequency = 12,
start = c(yr, mth))
})
dput
第一个ts:
structure(c(9007L, 8106L, 8928L, 9137L, 10017L, 10826L, 11317L,
10744L, 9713L, 9938L, 9161L, 8927L, 7750L, 6981L, 8038L, 8422L,
8714L, 9512L, 10120L, 9823L, 8743L, 9129L, 8710L, 8680L, 8162L,
7306L, 8124L, 7870L, 9387L, 9556L, 10093L, 9620L, 8285L, 8466L,
8160L, 8034L, 7717L, 7461L, 7767L, 7925L, 8623L, 8945L, 10078L,
9179L, 8037L, 8488L, 7874L, 8647L, 7792L, 6957L, 7726L, 8106L,
8890L, 9299L, 10625L, 9302L, 8314L, 8850L, 8265L, 8796L, 7836L,
6892L, 7791L, 8192L, 9115L, 9434L, 10484L, 9827L, 9110L, 9070L,
8633L, 9240L), .Dim = c(72L, 1L), .Dimnames = list(NULL, "value"), index = structure(c(94694400,
97372800, 99792000, 102470400, 105062400, 107740800, 110332800,
113011200, 115689600, 118281600, 120960000, 123552000, 126230400,
128908800, 131328000, 134006400, 136598400, 139276800, 141868800,
144547200, 147225600, 149817600, 152496000, 155088000, 157766400,
160444800, 162864000, 165542400, 168134400, 170812800, 173404800,
176083200, 178761600, 181353600, 184032000, 186624000, 189302400,
191980800, 194486400, 197164800, 199756800, 202435200, 205027200,
207705600, 210384000, 212976000, 215654400, 218246400, 220924800,
223603200, 226022400, 228700800, 231292800, 233971200, 236563200,
239241600, 241920000, 244512000, 247190400, 249782400, 252460800,
255139200, 257558400, 260236800, 262828800, 265507200, 268099200,
270777600, 273456000, 276048000, 278726400, 281318400), tzone = "UTC", tclass = "Date"), .indexCLASS = "Date", tclass = "Date", .indexTZ = "UTC", tzone = "UTC", class = "ts", .Tsp = c(1973,
1978.91666666667, 12))
第 2 步:训练和预测ets
您需要一个帮助功能将您的输出转换为数据框:
# helping function
make_df <- function(ts_obj) {
ts_df <- timetk::tk_tbl(preserve_index = TRUE, ts_obj) %>%
mutate(index = zoo::as.Date(x = .$index, frac = 0)) %>%
dplyr::rename(date = index)
return(ts_df)
}
以下函数训练ets
和预测未来 12 个月;然后,它准备带有拟合值和预测值的表格:
lts <- lapply(ld, function(ts_obj) {
# train ets model and get fitted results
res_model <- ets(ts_obj, model = "ZZZ")
res_fit <- ts(as.numeric(res_model$fitted), start = start(ts_obj), frequency = 12)
# add extra metrics you may be interested in
model <- res_model[["method"]]
mse <- res_model[["mse"]]
# get forecasts for the next 12 months
res_fct <- forecast(res_model, h = 12)
res_fcst <- ts(res_fct$mean, start = end(ts_obj) + 1/12, frequency = 12)
# transform results to tbl
# for fitted output we keep the residuals and the 95% CI
res_fit_tbl <- make_df(res_fit) %>%
mutate(residuals = as.numeric(res_model[["residuals"]])) %>%
mutate(CI95_upper = value + 1.96*sqrt(res_model$sigma2),
CI95_lower = value - 1.96*sqrt(res_model$sigma2))
# the forecast output does not have residuals
res_fcst_tbl <- make_df(res_fcst)
return(list(res_fit_tbl = res_fit_tbl, res_fcst_tbl = res_fcst_tbl, model = model, mse = mse)) # don't forget to pass the extra metrics as output
})
第 3 步:将不同组的拟合输出和预测输出汇总在一起
# add groups back + other metrics of interest
lts_all <- lapply(names(lts), function(x, lts) {
output_fit <- lts[[x]][["res_fit_tbl"]] %>%
mutate(group = x,
model = lts[[x]][["model"]],
mse = lts[[x]][["mse"]])
output_fcst <- lts[[x]][["res_fcst_tbl"]] %>%
mutate(group = x)
return(list(output_fit=output_fit, output_fcst=output_fcst))
}, lts)
样本输出:
> lts_all[[1]][["output_fit"]]
# A tibble: 72 x 6
date value residuals CI95_upper CI95_lower group
<date> <dbl> <dbl> <dbl> <dbl> <chr>
1 1973-01-01 8509. 498. 9083. 7936. 1
2 1973-02-01 8006. 99.5 8580. 7433. 1
3 1973-03-01 8864. 64.4 9437. 8290. 1
4 1973-04-01 9152. -15.2 9726. 8579. 1
5 1973-05-01 9939. 77.9 10513. 9365. 1
6 1973-06-01 10435. 391. 11009. 9861. 1
7 1973-07-01 11595. -278. 12168. 11021. 1
8 1973-08-01 10717. 26.9 11291. 10143. 1
9 1973-09-01 9641. 72.4 10214. 9067. 1
10 1973-10-01 10024. -85.7 10597. 9450. 1
# ... with 62 more rows
> lts_all[[2]][["output_fit"]]
# A tibble: 72 x 6
date value residuals CI95_upper CI95_lower group
<date> <dbl> <dbl> <dbl> <dbl> <chr>
1 1973-01-01 8509. 498. 9083. 7936. 2
2 1973-02-01 8006. 99.5 8580. 7433. 2
3 1973-03-01 8864. 64.4 9437. 8290. 2
4 1973-04-01 9152. -15.2 9726. 8579. 2
5 1973-05-01 9939. 77.9 10513. 9365. 2
6 1973-06-01 10435. 391. 11009. 9861. 2
7 1973-07-01 11595. -278. 12168. 11021. 2
8 1973-08-01 10717. 26.9 11291. 10143. 2
9 1973-09-01 9641. 72.4 10214. 9067. 2
10 1973-10-01 10024. -85.7 10597. 9450. 2
# ... with 62 more rows
然后
# bring together the fitted respectively forecasting results
output_fit_all <- lapply(lts_all, function(x) x[[1]])
output_fit_all <- bind_rows(output_fit_all)
output_fcst_all <- lapply(lts_all, function(x) x[[2]])
output_fcst_all <- bind_rows(output_fcst_all)
推荐阅读
- python - 将向量从 C++ 返回到 Python,而不使用 SWIG 进行复制
- python-3.x - Python:使用子字符串对列表进行排序
- selenium - 通过 Selenium 启动 ChromeDriver 时,控制台中显示的字母数字字符串代表什么?
- python - 将具有多个标签的值写入 influxDB(使用 python)
- python - 遍历高光谱图像中的行
- android - 视图中的协程作业在哪里取消?
- c++ - 将枚举数组转换为参数包
- r - 如何使用 par() 在 R 中将四个 ggpar 对象合并为一个?
- r - 将每个日期具有一个值的数据框与每个日期具有多个条目的数据框结合起来
- sql - 在这种情况下与 <> 不同吗?