首页 > 解决方案 > 将 data.table 列中以逗号分隔的字符串中的数字转换为长表形式

问题描述

给定的是一个带有字符串列的data.table 。字符串包含表示任意数量的 (x, y, z) 点的逗号分隔值(因此逗号分隔值的数量可被 3 整除,例如 '1,2,3,4,5,6' 对应于两点 (1, 2, 3), (4, 5, 6))。我想将这些字符串转换成一个长表,以便每一行只包含其中一个点。应该扩展以前的data.table,并将其他列复制到相应数量的添加行。

我解决了这个任务,但是用一个丑陋的组合strsplit + matrix迭代单个行,lapply(1:nrow(DT))这很可能是非常低效的。我想知道是否有更优雅的解决方案。此外,我使用 300k 行 data.table 耗尽了 RAM。

生成示例数据

library(data.table)
set.seed(1237)
N <- 5 # number of rows for test data
listlengths <- round(runif(N, 1, 5))*3 # length of row-wise comma separated lists of 3D-points

generateStrList <- function(n){
  paste0(collapse = ",", round(runif(n, 0, 100)))
}

strlist <- lapply(listlengths, generateStrList)

# The follwoing data.table is given for the problem (read from a file with 'fread')
DT <- data.table(id = 1:N, b = round(runif(N, 0, 100)), c = strlist)
print(DT)

   id  b                                            c
1:  1 10                            80,96,40,83,86,12
2:  2 92 86,18,38,51,17,80,33,38,23,49,71,97,10,13,70
3:  3 76                                     84,39,86
4:  4 81                                      48,99,8
5:  5 56                             53,92,27,2,39,62

解决任务(有点低效)

# separate the points (x, y, z) encoded in string into a long table
separateList <- function(DT){
  CommaSeparatedList <- DT$c
  DT_new <- as.data.table(
    matrix( # convert to matrix to get 3 columns
      as.numeric( # convert to numerics
        strsplit(unlist(CommaSeparatedList), split = ",")[[1]]), # split string at commas into string vector (instead of list)
      ncol = 3, byrow = T)
    )
  setnames(DT_new, c("x", "y", "z"))
  DT_new[ , id := DT$id] # add columns 'id' and 'b' from original data.table, 
  DT_new[ , b := DT$b]     # they will have the same length as the listlength / 3
  return(DT_new[])
}

# test for first item only
separateList(DT[1])

    x  y  z id  b
1: 80 96 40  1 10
2: 83 86 12  1 10

#  apply on whole data set
DT_Long <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
print(DT_Long)

     x  y  z id  b
 1: 80 96 40  1 10 # in DT the rows 1 and 2 here were in the first row
 2: 83 86 12  1 10
 3: 86 18 38  2 92 # in DT row 2 contained 5 (x, y, z) points, so are extended to five rows here
 4: 51 17 80  2 92 # 'id' and 'b' are copied to fill DT_Long
 5: 33 38 23  2 92
 6: 49 71 97  2 92
 7: 10 13 70  2 92
 8: 84 39 86  3 76
 9: 48 99  8  4 81
10: 53 92 27  5 56
11:  2 39 62  5 56

编辑:基准测试

给定的解决方案(稍作修改以完全匹配结果)

foo_phann <- function(DT){
  DT <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
  setkey(DT, id)
  return(DT[])
}

foo_ronak <- function(DT){
  DT <- as.data.table(DT %>%
    separate_rows(c, sep = ',') %>%
    group_by(grp = ceiling(row_number()/3)) %>%
    mutate(cols = c('x', 'y', 'z')) %>%
    pivot_wider(names_from = cols, values_from  =c) %>%
    ungroup %>%
    select(-grp))[ , c("x", "y", "z", "id", "b")] # changed the column order to have identical results for benchmarking and the column type
  DT[ , c("x", "y", "z") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z")]
  setkey(DT, id)
  return(DT[])
}

foo_zx <- function(DT){
  DT <- DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))), by = id 
     ][, rn1 := factor(seq_len(.N) %% 3, 
                       levels = c(1,2,0), labels = c("x", "y", "z")), by = id
       ][, rn2 := seq_len(.N), by = .(id, rn1)
         ][ , dcast(.SD, id+b+rn2~rn1, value.var = "c")][ , c("x", "y", "z", "id", "b")]
# changed the column order and column type to match the results
  DT[ , c("x", "y", "z", "b") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z", "b")]
  return(DT[])
}

foo_a5 <- function(DT) {
  # unlist the relevant column and use strsplit, but don't make your matrices yet
  a <- strsplit(unlist(DT$c, use.names = FALSE), ",", TRUE)
  # expand all the other columns of the input data.table...
  DT <- cbind(DT[rep(seq.int(nrow(DT)), lengths(a)/3), 1:2], 
        # ... and bind it with your newly formed (single) matrix
        matrix(as.integer(unlist(a, use.names=FALSE)),
               ncol = 3, byrow = TRUE, 
               dimnames = list(NULL, c("x", "y", "z"))))
  setcolorder(DT, c("x", "y", "z", "id", "b"))
  setkey(DT, "id")
  return(DT[])
}

给出 N=1000 和 N=5000 的以下基准:

bench::mark(
  Method1 = foo_phann(DT),
  Method2 = foo_ronak(DT),
  Method3 = foo_zx(DT),
  Method4 = foo_a5(DT)
)

# N=1000
# A tibble: 4 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result       memory      time    gc      
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>       <list>      <list>  <list>  
1 Method1        1.3s     1.3s     0.766   96.05MB     3.83     1     5       1.3s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2     43.02ms  48.84ms    19.8      11.2MB     5.94    10     3      505ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3    153.53ms 156.08ms     5.98     9.74MB     7.97     3     4      502ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4      5.77ms   6.67ms   147.     417.88KB     1.98    74     1    505.1ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~

#N = 5000
    # A tibble: 4 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result       memory      time    gc      
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>       <list>      <list>  <list>  
1 Method1       6.98s    6.98s     0.143   481.2MB     5.59     1    39      6.98s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2    194.08ms 198.01ms     3.81     55.5MB     6.35     3     5   787.93ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3       1.43s    1.43s     0.699   199.6MB    16.1      1    23      1.43s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4     12.54ms  13.79ms    68.6       1.9MB     0       35     0   509.89ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~

正如预期的那样,与其他两种解决方案相比,我的解决方案(方法 1)效率低下。对于大量行,dplyr 解决方案 (Method2) 比 data.table 方法 (Method3) 更快且内存效率更高。不幸的是,在计算了我原来的 300k 行数据大约半小时后,data.table 放弃了内存错误(使用 Method2)。我想我必须首先将 data.table 拆分为多个并独立处理它们。但是,给定的解决方案都是对我的代码的很好的改进!

编辑:@A5C1D2H2I1M1N2O1R2T1 的方法 foo_a5() 无缝地贯穿我的整个数据!

出于纯粹的好奇心,我测试了所有四种方法以获得广泛的数字: 在此处输入图像描述

在此处输入图像描述

标签: rstringlistdata.table

解决方案


使用dplyrandtidyr您可以用逗号分割数据,创建 3 行组并以宽格式获取数据。

library(dplyr)
library(tidyr)

DT %>%
  separate_rows(c, sep = ',') %>%
  group_by(grp = ceiling(row_number()/3)) %>%
  mutate(cols = c('x', 'y', 'z')) %>%
  pivot_wider(names_from = cols, values_from  =c) %>%
  ungroup %>%
  select(-grp)

#      id     b x     y     z    
#   <int> <dbl> <chr> <chr> <chr>
# 1     1    10 80    96    40   
# 2     1    10 83    86    12   
# 3     2    92 86    18    38   
# 4     2    92 51    17    80   
# 5     2    92 33    38    23   
# 6     2    92 49    71    97   
# 7     2    92 10    13    70   
# 8     3    76 84    39    86   
# 9     4    81 48    99    8    
#10     5    56 53    92    27   
#11     5    56 2     39    62   

推荐阅读