首页 > 解决方案 > R data.table 逐行操作的首选性能过程?

问题描述

以下代码是否代表了遍历 R 的行data.table并将在每行中找到的值传递给函数的首选过程?还是有更高效的方法来做到这一点?

library(data.table)
set.seed(2)
n <- 100
b <- c(0.5, 1.5, -1)
phi <- 0.8
X <- cbind(1, matrix(rnorm(n*2, 0, 1), ncol = 2))
y <- X %*% matrix(b, ncol = 1) + rnorm(n, 0, phi)
d <- data.table(y, X)
setnames(d, c("y", "x0", "x1", "x2"))

logpost <- function(d, b1, b2, b3, phi, mub = 1, taub = 10, a = 0.5, z = 0.7){
    N <- nrow(d)
    mu <- b1 + b2 * d$x1 + b3 * d$x2
    lp <- -N * log(phi) -
        (1/(2*phi^2)) * sum( (d$y-mu)^2  ) -
        (1/(2*taub^2))*( (b1-mub)^2 + (b2-mub)^2 + (b3-mub)^2 ) -
        (a+1)*log(phi) - (z/phi)
    lp
}

nn <- 21
grid <- data.table(
expand.grid(b1 = seq(0, 1, len = nn),
    b2 = seq(1, 2, len = nn),
    b3 = seq(-1.5, -0.5, len = nn),
    phi = seq(0.4, 1.2, len = nn)))
grid[, id := 1:.N]
setkey(grid, id)

wraplogpost <- function(dd){
    logpost(d, dd$b1, dd$b2, dd$b3, dd$phi)
}
start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = seq_len(nrow(grid))]
difftime(Sys.time(), start)
# Time difference of 2.081544 secs

编辑:显示前几条记录

> head(grid)
b1 b2   b3 phi id        lp
1: 0.00  1 -1.5 0.4  1 -398.7618
2: 0.05  1 -1.5 0.4  2 -380.3674
3: 0.10  1 -1.5 0.4  3 -363.5356
4: 0.15  1 -1.5 0.4  4 -348.2663
5: 0.20  1 -1.5 0.4  5 -334.5595
6: 0.25  1 -1.5 0.4  6 -322.4152

我尝试过使用set,但这种方法似乎逊色

start <- Sys.time()
grid[, lp := NA_real_]
for(i in 1:nrow(grid)){
    llpp <- wraplogpost(grid[i])
    set(grid, i, "lp", llpp)
}
difftime(Sys.time(), start)
# Time difference of 21.71291 secs

编辑:显示前几条记录

> head(grid)
b1 b2   b3 phi id        lp
1: 0.00  1 -1.5 0.4  1 -398.7618
2: 0.05  1 -1.5 0.4  2 -380.3674
3: 0.10  1 -1.5 0.4  3 -363.5356
4: 0.15  1 -1.5 0.4  4 -348.2663
5: 0.20  1 -1.5 0.4  5 -334.5595
6: 0.25  1 -1.5 0.4  6 -322.4152

对相关文档的建议或指针将不胜感激。

编辑:根据评论:

start <- Sys.time()
grid[, lp := wraplogpost(.SD), by = .I]
difftime(Sys.time(), start)
Warning messages:
1: In b2 * d$x1 :
    longer object length is not a multiple of shorter object length
2: In b3 * d$x2 :
    longer object length is not a multiple of shorter object length
3: In d$y - mu :
    longer object length is not a multiple of shorter object length
> difftime(Sys.time(), start)
Time difference of 0.01199317 secs
> 
> head(grid)
b1 b2   b3 phi id        lp
1: 0.00  1 -1.5 0.4  1 -620977.2
2: 0.05  1 -1.5 0.4  2 -620977.2
3: 0.10  1 -1.5 0.4  3 -620977.2
4: 0.15  1 -1.5 0.4  4 -620977.2
5: 0.20  1 -1.5 0.4  5 -620977.2
6: 0.25  1 -1.5 0.4  6 -620977.2

这会为 . 生成错误的值lp

编辑感谢您的评论和回复。我知道这种情况可以通过使用替代方法来解决,我的兴趣是使用data.table.

编辑再次感谢您的回复。由于目前还没有解决如何使用 明确地做到这一点的问题data.table,我假设在不转向基础 R 的情况下没有理想的方法来实现这一点。

标签: rperformancedata.tablerowwise

解决方案


如果您想获得更好的性能(时间),您可以将逐行函数重写为矩阵计算。

start <- Sys.time()
grid_mat <- as.matrix(grid[, list(b1, b2, b3, 1)])
# function parameters
N <- nrow(d); mub = 1; taub = 10; a = 0.5; z = 0.7
d$const <- 1

# combining d$y - mu in this step already
mu_op <- matrix(c(-d$const, -d$x1, -d$x2, d$y), nrow = 4, byrow = TRUE)
mu_mat <- grid_mat %*% mu_op
mub_mat <- (grid_mat[, c("b1", "b2", "b3")] - mub)^2
# just to save one calculation of the log
phi <- grid$phi
log_phi <- log(grid$phi)

grid$lp2 <- -N * log_phi -
  (1/(2*phi^2)) * rowSums(mu_mat^2) -
  (1/(2*taub^2))*( rowSums(mub_mat) ) -
  (a+1)*log_phi - (z/phi)
head(grid)
difftime(Sys.time(), start)

第一行:

     b1 b2   b3 phi id        lp       lp2
1: 0.00  1 -1.5 0.4  1 -398.7618 -398.7618
2: 0.05  1 -1.5 0.4  2 -380.3674 -380.3674
3: 0.10  1 -1.5 0.4  3 -363.5356 -363.5356
4: 0.15  1 -1.5 0.4  4 -348.2663 -348.2663
5: 0.20  1 -1.5 0.4  5 -334.5595 -334.5595
6: 0.25  1 -1.5 0.4  6 -322.4152 -322.4152

对于时间:

# on your code on my pc:
Time difference of 4.390684 secs
# my code on my pc:
Time difference of 0.680476 secs

推荐阅读