首页 > 解决方案 > 需要帮助找到一种快速方法来识别每个变量的第一个非缺失观测值

问题描述

我有一个大型数据集,我希望在其中使用 data.table 来识别每个分组 id 的第一个非缺失值。

我目前通过定义一个函数并使用lapply(). 我也尝试过使用mclapply(),但这似乎更慢。

### Libraries ###
library(microbenchmark)
library(ggplot2)
library(data.table)

### Dummy Data Table ###
dt <- data.table(
  id = rep(1:4, each = 4),
  var_int = c(rep(NA, 3), 1L, rep(NA, 2), 10L, rep(NA, 2), 100L, rep(NA, 2), 1000L, rep(NA, 3)),
  var_dou = c(rep(NA, 2), 1, rep(NA, 2), 1.01, rep(NA, 2), 1.001, rep(NA, 3), rep(NA, 3), 1.0001),
  var_cha = c(NA, "a", rep(NA, 2), "b", rep(NA, 6), "c", rep(NA, 2), "d", NA),
  var_intmi = c(1L, rep(NA, 14), 4L)
)
dt
##     id var_int var_dou var_cha var_intmi
##  1:  1      NA      NA    <NA>         1
##  2:  1      NA      NA       a        NA
##  3:  1      NA  1.0000    <NA>        NA
##  4:  1       1      NA    <NA>        NA
##  5:  2      NA      NA       b        NA
##  6:  2      NA  1.0100    <NA>        NA
##  7:  2      10      NA    <NA>        NA
##  8:  2      NA      NA    <NA>        NA
##  9:  3      NA  1.0010    <NA>        NA
## 10:  3     100      NA    <NA>        NA
## 11:  3      NA      NA    <NA>        NA
## 12:  3      NA      NA       c        NA
## 13:  4    1000      NA    <NA>        NA
## 14:  4      NA      NA    <NA>        NA
## 15:  4      NA      NA       d        NA
## 16:  4      NA  1.0001    <NA>         4

### Functions ###
firstnonmiss_1 <- function(x){x[which(complete.cases(x))][1]}
firstnonmiss_2 <- function(x){first(x[complete.cases(x)])}
firstnonmiss_3 <- function(x){x[complete.cases(x)][1]}

### Desired Output ###
dt[, lapply(.SD, firstnonmiss_3), by = id]
##    id var_int var_dou var_cha var_intmi
## 1:  1       1  1.0000       a         1
## 2:  2      10  1.0100       b        NA
## 3:  3     100  1.0010       c        NA
## 4:  4    1000  1.0001       d         4

### Benchmarking ###
t <- microbenchmark(
  "which()[1]" = dt[, lapply(.SD, firstnonmiss_1), by = id],
  "first()" = dt[, lapply(.SD, firstnonmiss_2), by = id],
  "[1]" = dt[, lapply(.SD, firstnonmiss_3), by = id],
  times = 1e4
)
t
## Unit: microseconds
##        expr     min       lq     mean   median       uq       max neval
##  which()[1] 414.438 426.8485 516.7795 437.9710 460.8930 161388.83 10000
##     first() 401.574 413.6190 483.2857 424.6860 446.6475  41523.61 10000
##         [1] 388.845 401.4700 468.9951 411.3505 432.2035  33320.75 10000

### Plot Outputs ###
units <- attributes(print(t))[["unit"]]
autoplot(t) +
  labs(x = "Function", y = paste0("Timings, (", units, ")")) +
  scale_x_discrete() +
  scale_y_log10() +
  geom_violin(fill = "skyblue", alpha = 0.5) +
  theme_light() +
  theme(axis.text.y = element_text(family = "Monaco", angle = 90, hjust = 0.5))

基准测试

虚拟数据集的基准测试时间还不错,但是当我在我的实际数据集(1,019 列,1,506,451 行,502,540 个组 ID)上运行该函数时,大约需要 11 分钟才能完成。是否有更好/更快的方法来获取包含每个列/变量的每个组 id 的第一个非缺失观察值的折叠数据框?

标签: rdata.tablemissing-data

解决方案


This may be a case where melting the dataset and casting is faster when there are only 3 results per each group.

Using @chinsoon12's dataset, I get 2-3 seconds with OP's original solutions vs. 0.4 s with melt and cast. If you don't mind keeping the data molten (i.e., long), that is around 0.2 seconds which is about 10x faster than the original.

#melt and cast
dcast(melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)], grp ~ variable)

#only melt
melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)]

#approach with intermediate variables:
molten_DT<- na.omit(melt(DT, id.vars = 'grp'), 'value')
dcast(molten_DT[molten_DT[, .I[1], by = .(grp, variable)]$V1, ], grp ~ variable)
library(data.table)
library(microbenchmark)

#@chinsoon12's dataset
set.seed(0L)
ngrp <- 1000L #502540
avgNr <- 3L
nc <- 1000L #1019
DT <- data.table(
  as.data.table(matrix(sample(c(NA,1), ngrp*avgNr*nc, TRUE), nrow=ngrp*avgNr, ncol=nc)),
  grp=rep(1:ngrp, each=avgNr))

system.time(DT[, lapply(.SD, firstnonmiss_1), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_2), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_3), by = grp])
microbenchmark(melt_and_cast = {
  dcast(melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)], grp ~ variable)
  },melt_1 = {
    melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)]
  }
,times = 20)

推荐阅读