r - 单独取出一行时解析 JSON 数据有效,但使用 apply 遍历数据框(data frame)的各行时无效
问题描述
这是我的数据集的示例:
# Example data set: ten rows that all carry the same single-quoted JSON payload.
# `random_datetime()` is not defined in this snippet; it has to exist in the
# session before this statement is run.
test <- data.frame(
  index = numeric(10),
  id = runif(10, min = 1, max = 100),
  creation_date = random_datetime(10, st = "2015/01/01", et = "2015/12/31"),
  json_data = rep(
    "{'alldata': {'value': {'set': {'minA': 60, 'maxA': 190}, 'values': [[0, 0, 0, 0, 0, 0, 0, 56, 80, 95, 120, 140, 160, 156, 100, 77, 90, 89, 100, 112, 111, 89, 78, 178, 90, 80]], 'optimal': {'minB': 40, 'maxB': 190}, 'feature': 'postprandial after 3pm', 'type': {'timerange': 'Yes 3PM-7PM', 'hrs': '4Hr'}}, 'name': 'insight.A'}}",
    times = 10
  )
)
这是我试图应用于数据集每一行的函数:
# Flatten the JSON payload of one row and derive the share of readings that
# fall inside, above and below a fixed 70-180 range.
#
# x: a single row of the data set. Its `json_data` column holds a JSON string
#    written with single quotes; the last column is left out of the metadata
#    that is carried over to the result.
# The de-duplicated result is printed; the value of that print() call is what
# the function returns.
preprocess <- function(x) {
  row_df <- as.data.frame(x)

  # The payload uses single quotes, so swap them for double quotes before
  # handing the text to the JSON parser.
  json_text <- gsub("'", '"', row_df$json_data)
  parsed <- fromJSON(json_text, flatten = TRUE)
  leaves <- rrapply::rrapply(parsed, f = unlist, how = 'flatten')

  # Metadata columns (everything except the last column) followed by the
  # flattened JSON leaves.
  fields <- append(row_df[-length(row_df)], leaves)

  # Long format first, then one column per field; `rn` numbers the repeated
  # entries of a field so that multi-valued fields spread over several rows.
  long <- stack(map(fields, as.vector))
  long <- mutate(long, rn = rowid(ind))
  wide <- pivot_wider(long, names_from = ind, values_from = values)
  wide <- select(wide, -rn)
  # Single-valued fields only occupy the first row; carry them downwards.
  wide <- fill(wide, everything())
  wide <- type.convert(wide, as.is = TRUE)

  range_lo <- 70
  range_hi <- 180
  readings <- wide$values
  n_readings <- length(readings)
  wide$TIR_pre <- sum(readings > range_lo & readings < range_hi) / n_readings
  wide$TAB_pre <- sum(readings > range_hi) / n_readings
  wide$TBR_pre <- sum(readings < range_lo) / n_readings

  # Dropping the per-reading column leaves identical rows; keep one of them.
  summary_row <- distinct(select(wide, -values))
  print(summary_row)
}
当我只选取一行时,该函数可以正常工作:
# test[1, ] is still a one-row data.frame, so `one_row$json_data` inside
# preprocess() finds the JSON string.
preprocess(test[1,])
输出:
# A tibble: 1 x 14
index id creation_date minA maxA minB maxB feature timerange hrs name TIR_pre TAB_pre TBR_pre
<int> <dbl> <dbl> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0 11.9 1421724981. 60 190 40 190 postprandial aft~ Yes 3PM-7PM 4Hr insigh~ 0.692 0 0.308
但是当我尝试在数据集的每一行上迭代函数时,它不起作用:
# Fails: apply() coerces the data frame to a matrix before iterating, so
# preprocess() is handed each row as a vector instead of a one-row data.frame.
apply(test, 1, preprocess)
错误:
Error: lexical error: invalid char in json text.
NA
(right here) ------^
任何关于为什么的建议将不胜感激。提前致谢。
解决方案
考虑使用lapply
# Iterate over row numbers so that every call receives a one-row data.frame.
lapply(
  seq_len(nrow(test)),
  function(row_idx) preprocess(test[row_idx, ])
)
# A tibble: 1 x 14
index id creation_date minA maxA minB maxB feature timerange hrs name TIR_pre TAB_pre TBR_pre
<int> <dbl> <dbl> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0 59.6 1420788079. 60 190 40 190 postprandial after 3pm Yes 3PM-7PM 4Hr insight.A 0.692 0 0.308
# A tibble: 1 x 14
index id creation_date minA maxA minB maxB feature timerange hrs name TIR_pre TAB_pre TBR_pre
<int> <dbl> <dbl> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0 12.9 1423099725. 60 190 40 190 postprandial after 3pm Yes 3PM-7PM 4Hr insight.A 0.692 0 0.308
# A tibble: 1 x 14
index id creation_date minA maxA minB maxB feature timerange hrs name TIR_pre TAB_pre TBR_pre
<int> <dbl> <dbl> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0 63.2 1425300550. 60 190 40 190 postprandial after 3pm Yes 3PM-7PM 4Hr insight.A 0.692 0 0.308
# A tibble: 1 x 14
index id creation_date minA maxA minB maxB feature timerange hrs name TIR_pre TAB_pre TBR_pre
<int> <dbl> <dbl> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0 13.5 1429542735. 60 190 40 190 postprandial after 3pm Yes 3PM-7PM 4Hr insight.A 0.692 0 0.308
# A tibble: 1 x 14
index id creation_date minA maxA minB maxB feature timerange hrs name TIR_pre TAB_pre TBR_pre
<int> <dbl> <dbl> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0 3.36 1429611375. 60 190 40 190 postprandial after 3pm Yes 3PM-7PM 4Hr insight.A 0.692 0 0.308
# A tibble: 1 x 14
index id creation_date minA maxA minB maxB feature timerange hrs name TIR_pre TAB_pre TBR_pre
<int> <dbl> <dbl> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 0 83.9 1429865895. 60 190 40 190 postprandial after 3pm Yes 3PM-7PM 4Hr insight.A 0.692 0 0.308
...
要得到单个数据集,可以用 rbind 把 list 中的各个元素合并起来:
# Process every row separately, then stack the per-row results into one table.
out <- do.call(
  rbind,
  lapply(
    seq_len(nrow(test)),
    function(row_idx) preprocess(test[row_idx, ])
  )
)
或者使用 dplyr 的 rowwise:
library(dplyr)
# Row-wise variant: run preprocess() on each row, collect the per-row results
# in a list column, stack them, and put them next to the original columns.
# NOTE(review): cur_data() is deprecated as of dplyr 1.1.0 in favour of pick();
# it is kept here so the snippet still runs on older dplyr versions - confirm
# against the installed version.
test %>%
  rowwise() %>%
  transmute(out = list(preprocess(cur_data()))) %>%
  pull(out) %>%
  bind_rows() %>%
  bind_cols(test, .)
apply 的问题在于:它会先把数据框转换为 matrix,而 matrix 只能有一种类型,也就是说只要存在任何字符元素,整个矩阵都会被转换为字符类型。
因此传给 preprocess 的每一行是一个字符向量,而不是单行的数据框,JSON 解析也就随之失败。
推荐阅读
- java - 当一个函数返回一个包含泛型的引用类型时,为什么泛型的规则似乎发生了变化?
- svg - Using SVG markers in google_maps_flutter Flutter plugin
- assembly - 为什么寻址错误?x86实模式引导扇区代码
- r - 如何从 R 中的合并功能中提取我想要的输出
- php - 当我尝试访问某个页面时,如何解决 403 访问禁止错误?
- ios - iOS CallKit 广告集成
- javascript - 将 JS Pulltorefresh 实现到 Angularjs 中的问题
- javascript - 发布复选框值/通过电子邮件发送
- excel - 在用户窗体中分组可移动项目
- google-apps-script - Google 表格 - 仅针对选定的表格发送电子邮件/slack