首页 > 解决方案 > 通过长格式的某个变量获取索引

问题描述

我有一些长格式的数据,

library(data.table)
# Example long-format data: 2 groups, 2 time points per group,
# 3 measures per (group, time) combination.
dat <- data.table(
  id = seq_len(12),
  group = rep(1:2, each = 6),
  time = rep(c(9L, 8L, 6L, 7L), each = 3),
  measure = rep(1:3, times = 4)
)
> dat
    id group time measure
 1:  1     1    9       1
 2:  2     1    9       2
 3:  3     1    9       3
 4:  4     1    8       1
 5:  5     1    8       2
 6:  6     1    8       3
 7:  7     2    6       1
 8:  8     2    6       2
 9:  9     2    6       3
10: 10     2    7       1
11: 11     2    7       2
12: 12     2    7       3

我想创建一个新变量,给出每个 group 内 time 的索引(即 time 在组内按升序排列的序号)。也就是说,期望的输出是

> res
    id group time measure index
 1:  1     1    9       1     2
 2:  2     1    9       2     2
 3:  3     1    9       3     2
 4:  4     1    8       1     1
 5:  5     1    8       2     1
 6:  6     1    8       3     1
 7:  7     2    6       1     1
 8:  8     2    6       2     1
 9:  9     2    6       3     1
10: 10     2    7       1     2
11: 11     2    7       2     2
12: 12     2    7       3     2

如果每个组内每个 time 只有一行(即没有 measure 变量),我会采用类似下面的做法:

# Numbers the rows 1..N within each group after sorting by (group, time);
# this equals the time index only when every time value has a single row.
dat[order(group, time), list(index = seq_len(.N)), by = "group"]

但在这种情况下,我不知所措。

标签: r, indexing, data.table

解决方案


OP 的尝试已经非常接近正确答案了。下面是使用 rleid 的另一种做法:

# Visit the rows sorted by (group, time); within each group, rleid() gives
# every run of equal time values the next integer id, i.e. the dense rank
# of time. The new column is added to `dat` by reference.
dat[order(group, time), index := rleid(time), by = group]

计时代码:

library(data.table)
# Benchmark data: nr rows, group ids drawn from 1..ng and time values
# drawn from 1..nt (both sampled with replacement).
set.seed(0L)
nr <- 1e6
ng <- nr / 10
nt <- nr / 2
DT <- data.table(
  group = sample(ng, nr, replace = TRUE),
  time = sample(nt, nr, replace = TRUE)
)
# One copy per method, because `:=` modifies its table by reference.
DT0 <- copy(DT)
DT1 <- copy(DT)

# Method 0: dense rank of time within each group via frank(); updates DT0
# by reference.
mtd0 <- function() {
  DT0[, index := frank(time, ties.method = "dense"), by = group]
}

# Method 1: process rows ordered by (group, time) and label each run of
# equal time values within a group with rleid(); updates DT1 by reference.
mtd1 <- function() {
  DT1[order(group, time), index := rleid(time), by = group]
}

# check = FALSE turns off bench::mark()'s comparison of the returned values;
# agreement of the two index columns is verified explicitly afterwards.
bench::mark(mtd0(), mtd1(), check = FALSE)
identical(DT1[["index"]], DT0[["index"]])
#[1] TRUE

时间:

# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result              memory            time    gc          
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>              <list>            <list>  <list>      
1 mtd0()          42s      42s    0.0238    1.57GB    0.691     1    29        42s <df[,2] [1,000,000~ <df[,3] [300,811~ <bch:t~ <tibble [1 ~
2 mtd1()        398ms    404ms    2.47     28.29MB    3.71      2     3      809ms <df[,2] [1,000,000~ <df[,3] [494 x 3~ <bch:t~ <tibble [2 ~

另一个比较:

# mtd2 updates DT2 by reference, so it needs its own copy of the benchmark
# data (DT2 was not created alongside DT0 and DT1).
DT2 <- copy(DT)

# Method 2: `g` numbers every (group, time) combination with .GRP; `g2` then
# subtracts the first `g` seen in each group and adds 1, so the numbering
# restarts at 1 per group. The result is stored in column `g2`, not `index`.
# NOTE(review): relies on keyby numbering the combinations in sorted
# (group, time) order, so the first `g` in a group is its smallest; confirm
# for the data.table version in use.
mtd2 <- function() {
  DT2[, g := .GRP, keyby = .(group, time)][
    , g2 := g - first(g) + 1L, by = group]
}

bench::mark(# mtd0(),
  mtd1(), mtd2(), check = FALSE)

时间:

# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result               memory          time    gc           
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>               <list>          <list>  <list>       
1 mtd1()        370ms    372ms      2.69    24.5MB     4.03     2     3      745ms <df[,3] [1,000,000 ~ <df[,3] [101 x~ <bch:t~ <tibble [2 x~
2 mtd2()        464ms    469ms      2.13    23.7MB     1.07     2     1      937ms <df[,4] [1,000,000 ~ <df[,3] [16 x ~ <bch:t~ <tibble [2 x~

推荐阅读