r - 将 data.table 列中以逗号分隔的字符串中的数字转换为长表形式
问题描述
给定的是一个带有字符串列的data.table 。字符串包含表示任意数量的 (x, y, z) 点的逗号分隔值(因此逗号分隔值的数量可被 3 整除,例如 '1,2,3,4,5,6' 对应于两点 (1, 2, 3), (4, 5, 6))。我想将这些字符串转换成一个长表,以便每一行只包含其中一个点。应该扩展以前的data.table,并将其他列复制到相应数量的添加行。
我解决了这个任务,但是用一个丑陋的组合strsplit + matrix
迭代单个行,lapply(1:nrow(DT))
这很可能是非常低效的。我想知道是否有更优雅的解决方案。此外,我使用 300k 行 data.table 耗尽了 RAM。
生成示例数据
library(data.table)
set.seed(1237)
N <- 5 # number of rows for test data
listlengths <- round(runif(N, 1, 5))*3 # length of row-wise comma separated lists of 3D-points
generateStrList <- function(n){
paste0(collapse = ",", round(runif(n, 0, 100)))
}
strlist <- lapply(listlengths, generateStrList)
# The follwoing data.table is given for the problem (read from a file with 'fread')
DT <- data.table(id = 1:N, b = round(runif(N, 0, 100)), c = strlist)
print(DT)
id b c
1: 1 10 80,96,40,83,86,12
2: 2 92 86,18,38,51,17,80,33,38,23,49,71,97,10,13,70
3: 3 76 84,39,86
4: 4 81 48,99,8
5: 5 56 53,92,27,2,39,62
解决任务(有点低效)
# separate the points (x, y, z) encoded in string into a long table
separateList <- function(DT){
CommaSeparatedList <- DT$c
DT_new <- as.data.table(
matrix( # convert to matrix to get 3 columns
as.numeric( # convert to numerics
strsplit(unlist(CommaSeparatedList), split = ",")[[1]]), # split string at commas into string vector (instead of list)
ncol = 3, byrow = T)
)
setnames(DT_new, c("x", "y", "z"))
DT_new[ , id := DT$id] # add columns 'id' and 'b' from original data.table,
DT_new[ , b := DT$b] # they will have the same length as the listlength / 3
return(DT_new[])
}
# test for first item only
separateList(DT[1])
x y z id b
1: 80 96 40 1 10
2: 83 86 12 1 10
# apply on whole data set
DT_Long <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
print(DT_Long)
x y z id b
1: 80 96 40 1 10 # in DT the rows 1 and 2 here were in the first row
2: 83 86 12 1 10
3: 86 18 38 2 92 # in DT row 2 contained 5 (x, y, z) points, so are extended to five rows here
4: 51 17 80 2 92 # 'id' and 'b' are copied to fill DT_Long
5: 33 38 23 2 92
6: 49 71 97 2 92
7: 10 13 70 2 92
8: 84 39 86 3 76
9: 48 99 8 4 81
10: 53 92 27 5 56
11: 2 39 62 5 56
编辑:基准测试
给定的解决方案(稍作修改以完全匹配结果)
foo_phann <- function(DT){
DT <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
setkey(DT, id)
return(DT[])
}
foo_ronak <- function(DT){
DT <- as.data.table(DT %>%
separate_rows(c, sep = ',') %>%
group_by(grp = ceiling(row_number()/3)) %>%
mutate(cols = c('x', 'y', 'z')) %>%
pivot_wider(names_from = cols, values_from =c) %>%
ungroup %>%
select(-grp))[ , c("x", "y", "z", "id", "b")] # changed the column order to have identical results for benchmarking and the column type
DT[ , c("x", "y", "z") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z")]
setkey(DT, id)
return(DT[])
}
foo_zx <- function(DT){
DT <- DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))), by = id
][, rn1 := factor(seq_len(.N) %% 3,
levels = c(1,2,0), labels = c("x", "y", "z")), by = id
][, rn2 := seq_len(.N), by = .(id, rn1)
][ , dcast(.SD, id+b+rn2~rn1, value.var = "c")][ , c("x", "y", "z", "id", "b")]
# changed the column order and column type to match the results
DT[ , c("x", "y", "z", "b") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z", "b")]
return(DT[])
}
foo_a5 <- function(DT) {
# unlist the relevant column and use strsplit, but don't make your matrices yet
a <- strsplit(unlist(DT$c, use.names = FALSE), ",", TRUE)
# expand all the other columns of the input data.table...
DT <- cbind(DT[rep(seq.int(nrow(DT)), lengths(a)/3), 1:2],
# ... and bind it with your newly formed (single) matrix
matrix(as.integer(unlist(a, use.names=FALSE)),
ncol = 3, byrow = TRUE,
dimnames = list(NULL, c("x", "y", "z"))))
setcolorder(DT, c("x", "y", "z", "id", "b"))
setkey(DT, "id")
return(DT[])
}
给出 N=1000 和 N=5000 的以下基准:
bench::mark(
Method1 = foo_phann(DT),
Method2 = foo_ronak(DT),
Method3 = foo_zx(DT),
Method4 = foo_a5(DT)
)
# N=1000
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 Method1 1.3s 1.3s 0.766 96.05MB 3.83 1 5 1.3s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2 43.02ms 48.84ms 19.8 11.2MB 5.94 10 3 505ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3 153.53ms 156.08ms 5.98 9.74MB 7.97 3 4 502ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4 5.77ms 6.67ms 147. 417.88KB 1.98 74 1 505.1ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
#N = 5000
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 Method1 6.98s 6.98s 0.143 481.2MB 5.59 1 39 6.98s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2 194.08ms 198.01ms 3.81 55.5MB 6.35 3 5 787.93ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3 1.43s 1.43s 0.699 199.6MB 16.1 1 23 1.43s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4 12.54ms 13.79ms 68.6 1.9MB 0 35 0 509.89ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
正如预期的那样,与其他两种解决方案相比,我的解决方案(方法 1)效率低下。对于大量行,dplyr 解决方案 (Method2) 比 data.table 方法 (Method3) 更快且内存效率更高。不幸的是,在计算了我原来的 300k 行数据大约半小时后,data.table 放弃了内存错误(使用 Method2)。我想我必须首先将 data.table 拆分为多个并独立处理它们。但是,给定的解决方案都是对我的代码的很好的改进!
编辑:@A5C1D2H2I1M1N2O1R2T1 的方法 foo_a5() 无缝地贯穿我的整个数据!
解决方案
使用dplyr
andtidyr
您可以用逗号分割数据,创建 3 行组并以宽格式获取数据。
library(dplyr)
library(tidyr)
DT %>%
separate_rows(c, sep = ',') %>%
group_by(grp = ceiling(row_number()/3)) %>%
mutate(cols = c('x', 'y', 'z')) %>%
pivot_wider(names_from = cols, values_from =c) %>%
ungroup %>%
select(-grp)
# id b x y z
# <int> <dbl> <chr> <chr> <chr>
# 1 1 10 80 96 40
# 2 1 10 83 86 12
# 3 2 92 86 18 38
# 4 2 92 51 17 80
# 5 2 92 33 38 23
# 6 2 92 49 71 97
# 7 2 92 10 13 70
# 8 3 76 84 39 86
# 9 4 81 48 99 8
#10 5 56 53 92 27
#11 5 56 2 39 62
推荐阅读
- python - 为什么我在 Keras 二进制分类模型中的准确率为零?
- django - 如何使用 Django-Rest 框架逐步将数据发布到对象?
- python - 匹配时间戳
- javascript - 如何正确使用 react-router-dom 中的 useHistory()?
- sql - 为什么我必须执行相同结果的两个不同 sql 查询的行为不同?
- python - Python 编写 MIDI 文件
- emacs - 如何使 xelatex 文档中的标题在左侧对齐?
- python - 试图抓取这个网站并且代码返回一个空的 csv - PYTHON Scrapy
- flutter - 颤振医生在 PowerShell 中显示错误
- components - 点击|模型事件后 Laravel 8 Livewire 刷新数据