首页 > 解决方案 > 如何加快R循环

问题描述

我有一些看起来像的数据

dfr <- data.frame(pos=1:20,val=sample(90:120,20))

   pos val
1    1 116
2    2  97
3    3 100
4    4 105
5    5 112
6    6  95
7    7  91
8    8 117
9    9  98
10  10  94
11  11 110
12  12 118
13  13 120
14  14 115
15  15 103
16  16 102
17  17 109
18  18  90
19  19  93
20  20 107

我需要在 pos 的窗口大小上计算 val 的中值。我有以下功能:

#' @param dfr A data.frame with columns pos and val
#' @param win An integer denoting window size
#'
fn_median <- function(dfr,win=5)
{
  n <- nrow(dfr)
  vec_start <- vector(length=floor(n/win),mode="numeric")
  vec_end <- vector(length=floor(n/win),mode="numeric")
  vec_median <- vector(length=floor(n/win),mode="numeric")
  k <- 1
  i <- 1
  while(i<=n)
  {
    vec_start[k] <- dfr$pos[i]
    vec_end[k] <- dfr$pos[i+(win-1)]
    vec_median[k] <- median(dfr$val[i:(i+(win-1))])
    k <- k+1
    i <- i+win
  }

  return(data.frame(start=vec_start,end=vec_end,median=vec_median))
}

返回

> fn_median(dfr,5)
  start end median
1     1   5    105
2     6  10     95
3    11  15    115
4    16  20    102

基准测试

library(microbenchmark)
library(ggplot2)

autoplot(microbenchmark("loop"=fn_median(dfr,5),times=1000))

在此处输入图像描述

这段代码太慢了。我该如何改进它以使其更快?也许使用 apply 系列函数?

标签: r

解决方案


您可以使用by (或其他)data.table的整数除法来分组。pos - 15n

library(data.table)
fn_median <- function(df, n){
  setDT(df)
  df[, .(start = pos[1], end = last(pos), median = median(val))
      , by = .(drop = (pos - 1) %/% n)][, -'drop']
}

fn_median(dfr, 5)

#    start end median
# 1:     1   5    105
# 2:     6  10     95
# 3:    11  15    115
# 4:    16  20    102

编辑:基准

library(microbenchmark)
dfr <- data.frame(pos = seq_len(1e4), val = sample(1e4))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 10)
# Unit: milliseconds
#                expr        min         lq       mean     median         uq       max neval
#   fn_median(dfr, 5) 113.324354 131.217695 147.213517 139.283545 167.387556 188.76767    10
#  fn_median2(dfr, 5)   2.896002   3.026053   4.554341   3.448822   3.687797  15.40021    10

dfr <- data.frame(pos = seq_len(1e6), val = sample(1e6))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 5)
# Unit: milliseconds
#                expr        min         lq      mean     median         uq        max neval
#   fn_median(dfr, 5) 13295.8565 13710.4458 13729.029 13734.9328 13876.7450 14027.1664     5
#  fn_median2(dfr, 5)    97.7186   103.9742   120.471   119.3268   121.1799   160.1556     5

使用的功能:

library(data.table)
fn_median2 <- function(df, n){
  setDT(df)
  df[, .(start = pos[1], end = last(pos), median = median(val))
      , by = .(drop = (pos - 1) %/% n)][, -'drop']
}



fn_median <- function(dfr,win=5)
{
  n <- nrow(dfr)
  vec_start <- vector(length=floor(n/win),mode="numeric")
  vec_end <- vector(length=floor(n/win),mode="numeric")
  vec_median <- vector(length=floor(n/win),mode="numeric")
  k <- 1
  i <- 1
  while(i<=n)
  {
    vec_start[k] <- dfr$pos[i]
    vec_end[k] <- dfr$pos[i+(win-1)]
    vec_median[k] <- median(dfr$val[i:(i+(win-1))])
    k <- k+1
    i <- i+win
  }

  return(data.frame(start=vec_start,end=vec_end,median=vec_median))
}

推荐阅读