r - 是否有更好更快的方法来计算存储在变量中的一些统计数据并将结果分配给也存储在变量中的新列?
问题描述
我想知道是否有更快的方法来计算存储在变量中的一些统计数据并将结果分配给也存储在变量中的新列?我创建了一个名为“count”的函数,因为我只想计算有效数字(非缺失)。
在此示例中,变量stc会有所不同,它可以包含单个统计信息和多个统计信息,即:
stc = c("mean)
stc = c("mean", "sd")
stc = c("max", "min", "count)"
library("data.table")
library("glue")
tbl <- data.table(ID = paste0("ID", 1:9),
TR = rnorm(9),
GR = c("A", "A", "B", "C", "A", "C", "A", "B", "B"))
trt <- "New.Name"
rhs <- "TR"
blk <- "GR"
stc <- c("Mean", "Count")
count <- function(x, na.rm = TRUE) {
sum(!is.na(x), na.rm = na.rm)
}
lhr_expr <- glue(paste0("c('", paste(trt, stc, sep = '.', collapse = "', '"), "')"))
rhs_expr <- glue("list({glue_collapse(glue('{tolower(stc)}({rhs}, na.rm = TRUE)'), sep = ', ')})")
tbl[, eval(rlang::parse_expr(glue("({lhr_expr}) := {rhs_expr}"))), by = blk]
结果将是:
ID TR GR New.Name.Mean New.Name.Count
1: ID1 2.9521189 A 0.6473405 4
2: ID2 -0.5643511 A 0.6473405 4
3: ID3 0.5951386 B 0.2703137 3
4: ID4 0.4977452 C 0.2890320 2
5: ID5 -0.9350095 A 0.6473405 4
6: ID6 0.0803188 C 0.2890320 2
7: ID7 1.1366038 A 0.6473405 4
8: ID8 -0.2496585 B 0.2703137 3
9: ID9 0.4654609 B 0.2703137 3
更新:
我已经更新了代码并添加了到目前为止的所有建议以执行基准测试。
fun1
是罗兰建议的
fun3
并由Colefun4
建议
fun5
由MrSmithGoesToWashington建议
library("data.table")
library("benchr")
library("glue")
n <- 1000000
p_miss <- 0.05
dat <- data.table(ID = paste0("ID", 1:n),
TR = sample(c(rnorm((1 - p_miss)*n), rep(NA_real_, p_miss*n))),
GR = sample(paste0("GR", 1:500), n, replace = TRUE))
fun0 <- function(tbl, trt, rhs, blk, stc) {
tbl <- copy(tbl)
count <- function(x, na.rm = TRUE) {
sum(!is.na(x), na.rm = na.rm)
}
lhr_expr <- glue(paste0("c('", paste(trt, stc, sep = '.', collapse = "', '"), "')"))
rhs_expr <- glue(".({glue_collapse(glue('{tolower(stc)}({rhs}, na.rm = TRUE)'), sep = ', ')})")
tbl[, eval(rlang::parse_expr(glue("({lhr_expr}) := {rhs_expr}"))), by = blk]
}
fun1 <- function(tbl, trt, rhs, blk, stc) {
tbl <- copy(tbl)
count <- function(x, na.rm = TRUE) {
sum(!is.na(x), na.rm = na.rm)
}
for (fun in stc) {
FUN <- as.name(tolower(fun))
RHS <- as.name(rhs)
eval(bquote(
tbl[, paste(trt, fun, sep = ".") := .(FUN)(.(RHS), na.rm = TRUE), by = blk, verbose = FALSE][]
))
}
return(tbl)
}
fun2 <- function(tbl, trt, rhs, blk, stc) {
tbl <- copy(tbl)
count <- function(x, na.rm = TRUE) {
sum(!is.na(x), na.rm = na.rm)
}
dt_expr <- paste0(glue("{trt}.{stc}"), " = ", glue('{tolower(stc)}({rhs}, na.rm = TRUE)'), collapse = ', ')
tbl_blk <- tbl[, eval(rlang::parse_expr(glue(".({dt_expr})"))), by = blk]
tbl <- merge(tbl, tbl_blk, by = blk, all.x = TRUE)
return(tbl)
}
fun3 <- function(tbl, trt, rhs, blk, stc) {
count <- function(x, na.rm = TRUE) {
sum(!is.na(x), na.rm = na.rm)
}
tbl <- copy(tbl)
NSE_expr <- substitute(tbl[, (paste(trt, stc, sep = ".")) := my_call, blk],
list(my_call = as.call(c(quote(list), lapply(stc, function(x) as.call(list(str2lang(tolower(x)), str2lang(rhs), na.rm = TRUE)))))))
tbl <- eval(NSE_expr)[]
return(tbl)
}
fun4 <- function(tbl, trt, rhs, blk, stc) {
tbl <- copy(tbl)
count <- function(x, na.rm = TRUE) {
sum(!is.na(x), na.rm = na.rm)
}
tbl <- tbl[, (paste(trt, stc, sep = ".")) := lapply(stc, function(x) eval(call(tolower(x), str2lang(rhs), na.rm = TRUE))), blk]
return(tbl)
}
fun5 <- function(tbl, trt, rhs, blk, stc) {
tbl <- copy(tbl)
count <- function(x, na.rm = TRUE) sum(!is.na(x))
lhr_expr <- glue(paste0("c('", paste(trt, stc, sep = '.', collapse = "', '"), "')"))
rhs_expr <- glue(".({glue_collapse(glue('{tolower(stc)}({rhs}, na.rm = TRUE)'), sep = ', ')})")
tbl <- eval(rlang::parse_expr(paste0("tbl[, ",lhr_expr," := ",rhs_expr, ", by = ", blk, "]")))
return(tbl)
}
基准
很高兴看到优化此类代码的不同方法,即使只有几毫秒。
res <- benchr::benchmark(
tbl0 = fun0(tbl = dat, trt = "New.Name", rhs = "TR", blk = "GR", stc = c("Mean", "Count", "Min", "Max", "Var", "SD")),
tbl1 = fun1(tbl = dat, trt = "New.Name", rhs = "TR", blk = "GR", stc = c("Mean", "Count", "Min", "Max", "Var", "SD")),
tbl2 = fun2(tbl = dat, trt = "New.Name", rhs = "TR", blk = "GR", stc = c("Mean", "Count", "Min", "Max", "Var", "SD")),
tbl3 = fun3(tbl = dat, trt = "New.Name", rhs = "TR", blk = "GR", stc = c("Mean", "Count", "Min", "Max", "Var", "SD")),
tbl4 = fun4(tbl = dat, trt = "New.Name", rhs = "TR", blk = "GR", stc = c("Mean", "Count", "Min", "Max", "Var", "SD")),
tbl5 = fun5(tbl = dat, trt = "New.Name", rhs = "TR", blk = "GR", stc = c("Mean", "Count", "Min", "Max", "Var", "SD"))
)
print(res, order = "median")
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
tbl0 100 168 199 221 235 250 747 23500 1.00
tbl3 100 168 195 230 238 269 557 23800 1.04
tbl5 100 174 206 232 239 264 515 23900 1.05
tbl2 100 170 216 245 267 281 768 26700 1.11
tbl1 100 182 240 268 291 319 601 29100 1.22
tbl4 100 423 480 524 571 624 969 57100 2.37
不使用copy(tbl)
,和median
是一样fun0
的。fun3
fun5
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
tbl3 100 147 152 158 167 168 276 16700 1.00
tbl0 100 148 153 158 169 165 478 16900 1.00
tbl5 100 149 152 158 165 166 242 16500 1.00
tbl1 100 169 180 185 200 196 504 20000 1.17
tbl2 100 199 238 261 276 287 640 27600 1.65
tbl4 100 424 442 463 488 500 758 48800 2.93
似乎fun5
重复之间的结果更稳定。
谢谢你。
解决方案
这至少对我的系统上的平均值计算使用了一些 data.table 优化:
for (fun in stc) {
FUN <- as.name(tolower(fun))
RHS <- as.name(rhs)
eval(bquote(
tbl[, paste(trt, fun, sep = ".") :=
.(FUN)(.(RHS)),
by = blk, verbose = TRUE][]
))
}
无法测试性能,因为您没有提供合适且具有代表性的示例。所以,请对自己进行基准测试。
(请注意,如果您在 data.table 中.()
用作快捷方式,则需要进行调整。您可以使用 then代替,甚至使用胶水的东西。)list
substitute
bquote
推荐阅读
- sql - 似乎无法获得在 informix 上工作的外部联接
- javascript - 为什么表单数据在反应中没有保存到本地存储?
- javascript - 无法脱离 Visual Studio 2019 沙箱访问局域网
- java - 通过使用循环访问每个索引将字符串数组转换为整数数组
- python - 如何在 Python 中使用 Pandas DF 值作为字符串,以便我可以在 Selenium 中发送具有从 Pandas DF 中提取的确切值的密钥?
- javascript - 寻找匹配项目的理想组合
- javascript - 带有 Babel 和 Webpack 的 AngularJS 不加载组件控制器
- python - 验证棉花糖中的列表
- ios - 如何使用 Apple Enterprise 帐户更新用户 iOS 应用程序
- javascript - 如何在 javascript 中访问我在模型中拥有的列表的属性?