r - 将递归列表转换为 data.table R
问题描述
我有一个递归的列表(列表的列表),我想将其合并为一个 data.table。顶级列表(mylist)的每个元素都有两个元素:
- Timestamp:这是一个字符向量
- Value:这是一个列表
尽管结构如下所示,但每个元素的 class 似乎都是 data.table。
下图显示了这个列表的结构(代码太长了):
> str(mylist[[1]])
Classes ‘data.table’ and 'data.frame': 10 obs. of 2 variables:
$ Timestamp: chr "2019-06-01T00:00:00Z" "2019-06-01T00:15:00Z" "2019-06-01T00:30:00Z" "2019-06-01T00:45:00Z" ...
$ Value :List of 10
..$ : num 100
..$ : num 100
..$ : num 100
..$ : num 100
..$ : num 100
..$ : num 100
..$ : num 100
..$ : num 100
..$ : num 100
..$ : num 100
现在,我运行两个循环来得到一个合并后的 data.table:
循环 1:将 Timestamp 转换为 R 日期时间并设置 key
# Build one keyed, two-column table per element of `mylist`: timestamps parsed
# as UTC in `time`, and the flattened numeric values in a column named after
# the list element.
new_list <- lapply(1:length(mylist), function(idx){
  entry <- mylist[[idx]]
  out <- data.table(
    c1 = as.POSIXct(entry$Timestamp, format = '%Y-%m-%dT%H:%M:%S', tz = 'UTC'),
    c2 = as.numeric(unlist(entry$Value))
  )
  colnames(out) <- c('time', names(mylist)[idx])
  setkey(out, 'time')
  out
})
设置 key 是为了快速合并(也欢迎其他更快的方法)。当遇到空的 data.table(此列表中的第 4 个元素)时,此循环会失败。
循环 2:将列表合并为一个 data.table
显然,这只有在循环 1 没有失败(即列表中没有空的 data.table)时才有效。
# Fold the per-series tables into one wide table: start from the first table
# and full-outer-join each following table onto it by `time`.
# Reduce() replaces the `<<-` accumulator and also copes with a single table.
dt <- Reduce(
  function(acc, nxt) merge(acc, nxt, by = 'time', all = TRUE),
  new_list
)
所以,我的问题是:
- 当 mylist 中的某个条目是空的 data.table 或 list 时该怎么办?
- 就速度和可能的错误而言,将它们全部合并的最佳方法是什么?
下面给出了示例数据,我的实际列表有 40 个条目,每个条目约为 30,000 行。
更新:组合循环
#' Merge a named list of Timestamp/Value tables into one wide data.table.
#'
#' Every usable element of `listname` contributes one column, named after that
#' element, joined on a shared POSIXct `time` column with a full outer join.
#'
#' @param listname Named list whose elements carry a character `Timestamp`
#'   column (e.g. "2019-06-01T00:00:00Z") and a list-column `Value`.
#'   NULL and zero-row elements are skipped.
#' @return A data.table keyed on `time`; a zero-row table holding only `time`
#'   when no element contains data.
listMerge <- function(listname) {
  # Drop NULL / empty entries up front so they can neither break the
  # timestamp conversion nor become the seed of the merge.
  has_rows <- vapply(listname, function(z) NROW(z) > 0L, logical(1))
  keep <- which(has_rows)

  if (length(keep) == 0L) {
    return(data.table::data.table(
      time = as.POSIXct(character(0), tz = "UTC"),
      key = "time"
    ))
  }

  tables <- lapply(keep, function(n) {
    z <- listname[[n]]
    dt <- data.table::data.table(
      c1 = as.POSIXct(z$Timestamp, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC"),
      c2 = as.numeric(unlist(z$Value))
    )
    data.table::setnames(dt, c("time", names(listname)[n]))
    data.table::setkeyv(dt, "time")
    dt
  })

  # Fold with a full outer join; the accumulator is passed along explicitly
  # instead of being updated through `<<-`.
  Reduce(
    function(acc, nxt) merge(acc, nxt, by = "time", all = TRUE),
    tables
  )
}
这行得通——但不确定是否有更快的方法得到我想要的表。
数据
dput(mylist)
list(A = structure(list(Timestamp = c("2019-06-01T00:00:00Z",
"2019-06-01T00:15:00Z", "2019-06-01T00:30:00Z", "2019-06-01T00:45:00Z",
"2019-06-01T01:00:00Z", "2019-06-01T01:15:00Z", "2019-06-01T01:30:00Z",
"2019-06-01T01:45:00Z", "2019-06-01T02:00:00Z", "2019-06-01T02:15:00Z"
), Value = list(100.050957, 100.080826, 100.120308, 100.053459,
100.053825, 100.04792, 100.0679, 100.088554, 100.102737,
100.103653)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fe0a100a6e0>),
B = structure(list(Timestamp = c("2019-06-01T00:00:00Z",
"2019-06-01T00:15:00Z", "2019-06-01T00:30:00Z", "2019-06-01T00:45:00Z",
"2019-06-01T01:00:00Z", "2019-06-01T01:15:00Z", "2019-06-01T01:30:00Z",
"2019-06-01T01:45:00Z", "2019-06-01T02:00:00Z", "2019-06-01T02:15:00Z"
), Value = list(38.892395, 45.7738266, 53.21701, 57.08103,
62.1048546, 68.58914, 68.98703, 69.5170746, 71.49378,
78.59612)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fe0a100a6e0>),
C = structure(list(Timestamp = c("2019-06-01T00:00:00Z",
"2019-06-01T00:15:00Z", "2019-06-01T00:30:00Z", "2019-06-01T00:45:00Z",
"2019-06-01T01:00:00Z", "2019-06-01T01:15:00Z", "2019-06-01T01:30:00Z",
"2019-06-01T01:45:00Z", "2019-06-01T02:00:00Z", "2019-06-01T02:15:00Z"
), Value = list(30.5898361, 29.75237, 27.63596, 26.5089836,
25.6826324, 24.909977, 24.4333439, 23.5524445, 23.1864853,
22.7402916)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fe0a100a6e0>),
D = NULL, E = structure(list(Timestamp = c("2019-06-01T00:00:00Z",
"2019-06-01T00:15:00Z", "2019-06-01T00:30:00Z", "2019-06-01T00:45:00Z",
"2019-06-01T01:00:00Z", "2019-06-01T01:15:00Z", "2019-06-01T01:30:00Z",
"2019-06-01T01:45:00Z", "2019-06-01T02:00:00Z", "2019-06-01T02:15:00Z"
), Value = list(8.299942, 8.44268, 8.440144, 8.445086, 8.41551,
8.424382, 8.438655, 8.46398, 8.445853, 8.476906)), row.names = c(NA,
-10L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x7fe0a100a6e0>),
F = structure(list(Timestamp = c("2019-06-01T00:00:00Z",
"2019-06-01T00:15:00Z", "2019-06-01T00:30:00Z", "2019-06-01T00:45:00Z",
"2019-06-01T01:00:00Z", "2019-06-01T01:15:00Z", "2019-06-01T01:30:00Z",
"2019-06-01T01:45:00Z", "2019-06-01T02:00:00Z", "2019-06-01T02:15:00Z"
), Value = list(85.48002, 88.071, 87.71461, 86.2900848, 85.50101,
82.4923248, 81.78603, 82.4504547, 82.00605, 82.12493)), row.names = c(NA,
-10L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x7fe0a100a6e0>),
G = structure(list(Timestamp = c("2019-06-01T00:00:00Z",
"2019-06-01T00:15:00Z", "2019-06-01T00:30:00Z", "2019-06-01T00:45:00Z",
"2019-06-01T01:00:00Z", "2019-06-01T01:15:00Z", "2019-06-01T01:30:00Z",
"2019-06-01T01:45:00Z", "2019-06-01T02:00:00Z", "2019-06-01T02:15:00Z"
), Value = list(0.870313, 0.862552762, 0.8827777, 0.8639478,
0.849139452, 0.874981, 0.833493, 0.89307636, 0.8647241,
0.8711139)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fe0a100a6e0>))
解决方案
我们可以用 unlist 将 list 类型的 'Value' 列转换为向量,然后用 rbindlist 将由 data.table 组成的 list 合并为单个 data.table。
在这里,我们还假设 OP 除了帖子中展示的 unlist 之外,还想做一些其他的预处理
library(data.table)
# Flatten each table's list-column `Value` to a plain vector, then stack the
# tables; `grp` records the name of the list element each row came from.
# copy() keeps `:=` from modifying the tables inside `mylist` by reference.
# NULL entries are returned as NULL, as in the one-line form of this call.
rbindlist(lapply(mylist, function(dat) {
  if (!is.null(dat)) copy(dat)[, Value := unlist(Value)]
}), idcol = 'grp')
#grp Timestamp Value
#1: A 2019-06-01T00:00:00Z 100.0509570
#2: A 2019-06-01T00:15:00Z 100.0808260
#3: A 2019-06-01T00:30:00Z 100.1203080
#4: A 2019-06-01T00:45:00Z 100.0534590
#5: A 2019-06-01T01:00:00Z 100.0538250
#6: A 2019-06-01T01:15:00Z 100.0479200
#..
另外,请注意,直接对含有嵌套 list 列的 list 应用 rbindlist 并不会自动将该列转换为 vector,即:
# Stack the tables in `mylist` directly (no unlist step) and show the
# structure of the result.
str(rbindlist(mylist, idcol = TRUE))
#Classes ‘data.table’ and 'data.frame': 60 obs. of 3 variables:
# $ .id : chr "A" "A" "A" "A" ...
# $ Timestamp: chr "2019-06-01T00:00:00Z" "2019-06-01T00:15:00Z" #"2019-06-01T00:30:00Z" "2019-06-01T00:45:00Z" ...
# $ Value :List of 60
# ..$ : num 100
# ..$ : num 100
# ..$ : num 100
# ..$ : num 100
# ..$ : num 100
# ..$ : num 100
# ..$ : num 100
# ...
因此,我们可能必须在应用 rbindlist 之前或之后进行 unlist
# Stack the tables first, then flatten the list-column `Value` of the
# combined table in place and inspect the result.
out <- rbindlist(mylist, idcol = 'grp')
out[, Value := unlist(Value)]
str(out)
#Classes ‘data.table’ and 'data.frame': 60 obs. of 3 variables:
# $ grp : chr "A" "A" "A" "A" ...
#$ Timestamp: chr "2019-06-01T00:00:00Z" "2019-06-01T00:15:00Z" "2019-06-01T00:30:00Z" "2019-06-01T00:45:00Z" ...
#$ Value : num 100 100 100 100 100 ...
要将 'Timestamp' 列转换为 DateTime,我们可以使用 as.POSIXct
# The trailing "Z" marks these timestamps as UTC, so parse them with an
# explicit tz rather than the session's local time zone.
out[, Timestamp := as.POSIXct(Timestamp, format = "%Y-%m-%dT%TZ", tz = "UTC")]
更新
如果我们需要按 'Timestamp' 执行 merge,一种选择是先转换为 xts,然后再执行 merge
library(xts)
# Mask of non-NULL entries; vapply() always yields a logical vector.
i1 <- !vapply(mylist, is.null, logical(1))
# Flatten `Value` on a copy so the tables inside `mylist` stay untouched.
mylist1 <- lapply(mylist[i1], function(dat) copy(dat)[, Value := unlist(Value)])
# One xts series per table, indexed by the timestamp parsed as UTC (the
# strings end in "Z"), then merged on that index.
outn <- Reduce(merge, lapply(mylist1, function(x) {
  xts(
    x$Value,
    order.by = as.POSIXct(x$Timestamp, format = "%Y-%m-%dT%TZ", tz = "UTC")
  )
}))
colnames(outn) <- paste0("Value", seq_len(ncol(outn)))
outn
# Value1 Value2 Value3 Value4 Value5 Value6
#2019-06-01 00:00:00 100.0510 38.89240 30.58984 8.299942 85.48002 0.8703130
#2019-06-01 00:15:00 100.0808 45.77383 29.75237 8.442680 88.07100 0.8625528
#2019-06-01 00:30:00 100.1203 53.21701 27.63596 8.440144 87.71461 0.8827777
#2019-06-01 00:45:00 100.0535 57.08103 26.50898 8.445086 86.29008 0.8639478
#2019-06-01 01:00:00 100.0538 62.10485 25.68263 8.415510 85.50101 0.8491395
#2019-06-01 01:15:00 100.0479 68.58914 24.90998 8.424382 82.49232 0.8749810
#2019-06-01 01:30:00 100.0679 68.98703 24.43334 8.438655 81.78603 0.8334930
#2019-06-01 01:45:00 100.0886 69.51707 23.55244 8.463980 82.45045 0.8930764
#2019-06-01 02:00:00 100.1027 71.49378 23.18649 8.445853 82.00605 0.8647241
#2019-06-01 02:15:00 100.1037 78.59612 22.74029 8.476906 82.12493 0.8711139
由于它是一个 xts 对象,因此可以直接对它使用 plot,以获得有关 'Value' 列的更多见解
plot(outn)
或者另一个选项是使用来自 dplyr 的 bind_rows
library(dplyr)
library(lubridate)
library(purrr)
# Stack the tables (tagging rows with their list name in `grp`), then flatten
# `Value` to doubles and parse `Timestamp` into date-times.
out1 <- mutate(
  bind_rows(mylist, .id = 'grp'),
  Value = flatten_dbl(Value),
  Timestamp = ymd_hms(Timestamp)
)
str(out1)
#'data.frame': 60 obs. of 3 variables:
# $ Timestamp: POSIXct, format: "2019-06-01 00:00:00" "2019-06-01 00:15:00" "2019-06-01 00:30:00" "2019-06-01 00:45:00" ...
# $ Value : num 100 100 100 100 100 ...
# $ grp : chr "A" "A" "A" "A" ...
推荐阅读
- python - Siamese network,下半部分使用dense layer而不是euclidean distance layer
- google-app-engine - 无法使用来自 github 的云构建在应用引擎上部署应用
- clojure - 使用 Overtone 从磁盘读取声音文件?
- javascript - 如何以角度获取当地时间
- certificate - 使用 RSA 签署 DTLSv1.0 证书验证消息的内容
- python - 如何在 Python 中找到负数的 sqrt
- sql - 在 Oracle 中使用 XML 数据创建 SQL 视图
- react-native - 当模态在本机反应中打开时,可触摸的突出显示不会变得模糊
- javascript - google-calendar sinon 存根似乎不起作用
- testing - 在 testcafe 中捕获数据属性