r - 如何减小在函数内部拟合的回归对象的大小(保存为 .rds 时文件大小会爆炸)
问题描述
我正在使用 rstanarm 在函数内部拟合 stan_glm 模型。我遇到一个问题:拟合得到的 stanfit 对象在保存为 .rds 时文件大小会爆炸,但仅当模型是在函数内部拟合时才会如此。问题似乎在于 stanfit 对象保存了一份本地环境的副本,随后 write_rds 把它一并写入了磁盘?在函数内部手动删除大对象基本上可以解决这个问题,但这是一个相当笨拙的办法,所以想知道是否有人能提供更优雅的解决方式?下面是一个玩具 reprex(可重现示例)(警告:这会向磁盘写入一些 .rds 文件,我在示例结束时会将它们删除,但请留意)
library(readr)
library(rstanarm)
#> Loading required package: Rcpp
#> rstanarm (Version 2.19.3, packaged: 2020-02-11 05:16:41 UTC)
#> - Do not expect the default priors to remain the same in future rstanarm versions.
#> Thus, R scripts should specify priors explicitly, even if they are just the defaults.
#> - For execution on a local, multicore CPU with excess RAM we recommend calling
#> options(mc.cores = parallel::detectCores())
#> - bayesplot theme set to bayesplot::theme_default()
#> * Does _not_ affect other ggplot2 plots
#> * See ?bayesplot_theme_set for details on theme setting
library(gapminder)
# create a largeish object: a 5000 x 5000 matrix (the 10000 random draws are recycled to fill it)
test <- matrix(data = rnorm(10000), nrow = 10000/2, ncol = 10000/2)
# fit model in the global environment
a = stan_glm(lifeExp ~ gdpPercap, data = gapminder, refresh =0)
print(object.size(a), unit = "Mb")
#> 1.4 Mb
# fit model inside function , passing but not using largeish object
# Fit the regression inside a function, so the model (and its formula) is
# created in the function's local environment rather than the global one.
#   gap   - data set handed to stan_glm() as `data`
#   testy - a large object that the model itself never uses
#   clean - if TRUE, remove the large objects from the function environment
#           before the model is fitted
# The fitted model is the value of the final assignment, so it is returned
# invisibly.
memfoo <- function(gap, testy, clean = FALSE){
# local copy of the large object; the model never references it
d <- testy
if (clean){
# drop both references to the large object before the formula is created
rm(d,testy)
}
# NOTE(review): the formula is created here, so it presumably keeps a
# reference to this function's environment and whatever is still in it -
# confirm against how the fit is serialized
a <- stan_glm(lifeExp ~ gdpPercap, data = gap, refresh = 0)
}
b <- memfoo(gapminder, test)
# fit model again, but removing large objects from the environment before running
d <- memfoo(gapminder, test, clean = TRUE)
print(object.size(a), unit = "Mb")
#> 1.4 Mb
print(object.size(b), unit = "Mb")
#> 1.4 Mb
print(object.size(d), unit = "Mb")
#> 1.4 Mb
# all same size in memory
# write to .rds
write_rds(a,"a.rds")
write_rds(b,"b.rds")
write_rds(d,"d.rds")
# the model fit inside the function, with the largeish object in its environment,
# is ~45 times bigger on disk than the same regression fit outside the function!
file.size("a.rds")
#> [1] 9026456
file.size("b.rds")
#> [1] 410011317
file.size("d.rds")
#> [1] 10011197
file.size("b.rds") / file.size("a.rds")
#> [1] 45.42329
file.remove(c("a.rds", "b.rds", "d.rds"))
#> [1] TRUE TRUE TRUE
由reprex 包于 2020-03-11 创建(v0.3.0)
解决方案
事实证明,按照此处发布的解决方案,在函数内部为回归指定一个新的环境就可以解决问题。
library(tidyverse)
library(rstanarm)
#> Loading required package: Rcpp
#> rstanarm (Version 2.19.3, packaged: 2020-02-11 05:16:41 UTC)
#> - Do not expect the default priors to remain the same in future rstanarm versions.
#> Thus, R scripts should specify priors explicitly, even if they are just the defaults.
#> - For execution on a local, multicore CPU with excess RAM we recommend calling
#> options(mc.cores = parallel::detectCores())
#> - bayesplot theme set to bayesplot::theme_default()
#> * Does _not_ affect other ggplot2 plots
#> * See ?bayesplot_theme_set for details on theme setting
library(gapminder)
library(butcher)
# create a largeish object
test <- matrix(data = rnorm(10000), nrow = 10000/2, ncol = 10000/2)
# fit model in the global environment
a = stan_glm(lifeExp ~ gdpPercap, data = gapminder, refresh =0)
# fit model inside function , passing but not using largeish object
# Fit the regression inside a function, optionally evaluating the fit in a
# fresh environment instead of the function's own environment.
#   gap   - data set handed to stan_glm() as `data`
#   testy - a large object that the model itself never uses
#   clean - if TRUE, fit the model inside a new environment that holds only
#           `gap`; if FALSE, fit directly in the function environment
# The fit from whichever branch runs is the function's value.
memfoo <- function(gap, testy, clean = FALSE){
# local copy of the large object; the model never references it
d <- testy
if (clean){
# new environment whose parent is the global environment, so it does not
# see this function's locals (d, testy)
env <- new.env(parent = .GlobalEnv)
# copy in the only object the model call needs
env$gap <- gap
# evaluate the model call inside `env`, so the formula is created there
a <- with(env,{stan_glm(lifeExp ~ gdpPercap, data = gap, refresh = 0)})
} else {
# formula is created in the function environment, alongside d and testy
a <- stan_glm(lifeExp ~ gdpPercap, data = gap, refresh = 0)
}
}
b <- memfoo(gapminder, test)
d <- memfoo(gapminder, test, clean = TRUE)
butcher::weigh(a)
#> # A tibble: 50 x 2
#> object size
#> <chr> <dbl>
#> 1 stanfit 3.87
#> 2 residuals 0.123
#> 3 data.country 0.017
#> 4 fitted.values 0.0143
#> 5 linear.predictors 0.0143
#> 6 y 0.0143
#> 7 model.lifeExp 0.0137
#> 8 model.gdpPercap 0.0137
#> 9 data.lifeExp 0.0137
#> 10 data.gdpPercap 0.0137
#> # … with 40 more rows
butcher::weigh(b)
#> # A tibble: 50 x 2
#> object size
#> <chr> <dbl>
#> 1 terms 204.
#> 2 formula 204.
#> 3 stanfit 3.87
#> 4 residuals 0.123
#> 5 data.country 0.017
#> 6 fitted.values 0.0143
#> 7 linear.predictors 0.0143
#> 8 y 0.0143
#> 9 model.lifeExp 0.0137
#> 10 model.gdpPercap 0.0137
#> # … with 40 more rows
butcher::weigh(d)
#> # A tibble: 50 x 2
#> object size
#> <chr> <dbl>
#> 1 stanfit 3.87
#> 2 residuals 0.123
#> 3 terms 0.0700
#> 4 formula 0.0677
#> 5 data.country 0.017
#> 6 fitted.values 0.0143
#> 7 linear.predictors 0.0143
#> 8 y 0.0143
#> 9 model.lifeExp 0.0137
#> 10 model.gdpPercap 0.0137
#> # … with 40 more rows
write_rds(a, path = "a.rds")
write_rds(b, path = "b.rds")
write_rds(d, path = "d.rds")
file.size("a.rds")
#> [1] 9110178
file.size("b.rds")
#> [1] 410171588
file.size("d.rds")
#> [1] 9167588
file.size("d.rds") / file.size("a.rds")
#> [1] 1.006302
file.remove(c("a.rds", "b.rds", "d.rds"))
#> [1] TRUE TRUE TRUE
由reprex 包于 2020-03-12 创建(v0.3.0)
推荐阅读
- c++ - 简单的 Cairo/Quartz C++ 示例
- android - WorkManager 在不同应用中的 uniqueWorkName
- c++ - Clion 的调试器正在运行程序,但常规控制台没有。
- algorithm - 约翰逊算法的负边缘 - 距离矩阵
- python - Pandas - 将 excel 数据框转换为特定的字典格式
- python - Pandas:对多列求和,但如果该行中的任何列是 NaN 或 0,则写 NaN
- hyperledger-fabric - 订货人如何选择组织?
- graphics - 使相机跟随 GODOT 中的对象的脚本是什么?
- python - 使用追加添加到字典内列表中的值
- xamarin - 如何在 Akavache 中订阅新价值?