首页 > 解决方案 > R:解析 JSON 数据时,单独取出一行可以成功,但用 apply 逐行迭代数据框时失败

问题描述

这是我的数据集的示例:

# Sample data set: ten rows that all carry the same JSON-like payload
# (note the single quotes, which are not valid JSON as-is).
# NOTE(review): random_datetime() is not defined in this snippet; it must be
# provided by the caller's environment.
test <- data.frame(
  index = rep(0, each = 10),
  id = runif(10, min = 1, max = 100),
  creation_date = random_datetime(10, st = "2015/01/01", et = "2015/12/31"),
  json_data = rep(
    "{'alldata': {'value': {'set': {'minA': 60, 'maxA': 190}, 'values': [[0, 0, 0, 0, 0, 0, 0, 56, 80, 95, 120, 140, 160, 156, 100, 77, 90, 89, 100, 112, 111, 89, 78, 178, 90, 80]], 'optimal': {'minB': 40, 'maxB': 190}, 'feature': 'postprandial after 3pm', 'type': {'timerange': 'Yes 3PM-7PM', 'hrs': '4Hr'}}, 'name': 'insight.A'}}",
    times = 10
  )
)

这是我试图应用于数据集每一行的函数:

# Flatten the JSON payload of one row into a single tidy row and attach three
# summary ratios computed from the parsed `values` series.
#
# x: one row of the data set. It is passed through as.data.frame() and must
#    expose a `json_data` column; that column is expected to be the LAST one,
#    because it is dropped by position below.
# Side effect: prints the result. The function's value is whatever
# print(result) returns.
# NOTE(review): a bare vector (e.g. what apply() hands over for each row) has
# no `json_data` column after as.data.frame() -- callers should pass a
# one-row data frame such as test[i, ].
preprocess <- function (x) {
  
  one_row <- as.data.frame(x)
  
  # Parse the payload and reshape it:
  #  - single quotes are swapped for double quotes so the text parses as JSON
  #    (any apostrophe inside a value would be replaced as well);
  #  - fromJSON is presumably jsonlite::fromJSON (it takes `flatten`) -- confirm;
  #  - rrapply(..., how = 'flatten') is used to collapse the nested list into
  #    a flat list of leaves.
  processed <-
    rrapply::rrapply(fromJSON(gsub("'", '"', one_row$json_data),
                              flatten = TRUE), f = unlist, how = 'flatten') %>%
    # Prepend every column of the row except the last one (json_data).
    append(one_row[-length(one_row)], .) %>%
    map(as.vector) %>%
    # Long format: one (values, ind) pair per element of every field.
    stack %>%
    # Running counter within each field so pivot_wider() gets one row per
    # position (rowid is presumably data.table::rowid -- confirm).
    mutate(rn = rowid(ind)) %>%
    pivot_wider(names_from = ind, values_from = values) %>%
    select(-rn) %>%
    # Single-valued fields only occupy the first row after pivoting; carry
    # them down so every row of the `values` series is complete.
    fill(everything()) %>%
    type.convert(as.is = TRUE)
  
  # Thresholds for the ratios below. All comparisons are strict, so a reading
  # exactly equal to a threshold is counted in none of the three ratios.
  lower_range <- 70
  upper_range <- 180
  SG <- processed$values
  
  # Share of readings strictly inside / above / below the range.
  processed$TIR_pre <- sum(SG > lower_range & SG < upper_range)/length(SG)
  processed$TAB_pre <- sum(SG > upper_range)/length(SG)
  processed$TBR_pre <- sum(SG < lower_range)/length(SG)
  
  # Drop the per-reading column and collapse the now-identical rows.
  result <- processed %>%
    select(-values) %>% 
    distinct()
  
  print(result)
}

当我只选取一行时,该函数可以正常工作:

# test[1, ] is still a one-row data frame, so column types are preserved.
preprocess(test[1, ])

输出:

# A tibble: 1 x 14
  index    id creation_date  minA  maxA  minB  maxB feature           timerange   hrs   name    TIR_pre TAB_pre TBR_pre
  <int> <dbl>         <dbl> <int> <int> <int> <int> <chr>             <chr>       <chr> <chr>     <dbl>   <dbl>   <dbl>
1     0  11.9   1421724981.    60   190    40   190 postprandial aft~ Yes 3PM-7PM 4Hr   insigh~   0.692       0   0.308

但是当我尝试在数据集的每一行上迭代函数时,它不起作用:

# NOTE: apply() coerces the data frame to a matrix before iterating, so each
# row reaches preprocess() as a character vector, not a one-row data frame.
apply(X = test, MARGIN = 1, FUN = preprocess)

错误:

 Error: lexical error: invalid char in json text.
                                       NA
                     (right here) ------^ 

如果有人能解释其中的原因,我将不胜感激。提前致谢。

标签: r, json

解决方案


可以考虑改用 lapply,按行索引逐行迭代:

# Iterate over row positions so preprocess() always receives a one-row
# data frame with its column types intact.
lapply(
  seq_len(nrow(test)),
  function(row_idx) preprocess(test[row_idx, ])
)
# A tibble: 1 x 14
  index    id creation_date  minA  maxA  minB  maxB feature                timerange   hrs   name      TIR_pre TAB_pre TBR_pre
  <int> <dbl>         <dbl> <int> <int> <int> <int> <chr>                  <chr>       <chr> <chr>       <dbl>   <dbl>   <dbl>
1     0  59.6   1420788079.    60   190    40   190 postprandial after 3pm Yes 3PM-7PM 4Hr   insight.A   0.692       0   0.308
# A tibble: 1 x 14
  index    id creation_date  minA  maxA  minB  maxB feature                timerange   hrs   name      TIR_pre TAB_pre TBR_pre
  <int> <dbl>         <dbl> <int> <int> <int> <int> <chr>                  <chr>       <chr> <chr>       <dbl>   <dbl>   <dbl>
1     0  12.9   1423099725.    60   190    40   190 postprandial after 3pm Yes 3PM-7PM 4Hr   insight.A   0.692       0   0.308
# A tibble: 1 x 14
  index    id creation_date  minA  maxA  minB  maxB feature                timerange   hrs   name      TIR_pre TAB_pre TBR_pre
  <int> <dbl>         <dbl> <int> <int> <int> <int> <chr>                  <chr>       <chr> <chr>       <dbl>   <dbl>   <dbl>
1     0  63.2   1425300550.    60   190    40   190 postprandial after 3pm Yes 3PM-7PM 4Hr   insight.A   0.692       0   0.308
# A tibble: 1 x 14
  index    id creation_date  minA  maxA  minB  maxB feature                timerange   hrs   name      TIR_pre TAB_pre TBR_pre
  <int> <dbl>         <dbl> <int> <int> <int> <int> <chr>                  <chr>       <chr> <chr>       <dbl>   <dbl>   <dbl>
1     0  13.5   1429542735.    60   190    40   190 postprandial after 3pm Yes 3PM-7PM 4Hr   insight.A   0.692       0   0.308
# A tibble: 1 x 14
  index    id creation_date  minA  maxA  minB  maxB feature                timerange   hrs   name      TIR_pre TAB_pre TBR_pre
  <int> <dbl>         <dbl> <int> <int> <int> <int> <chr>                  <chr>       <chr> <chr>       <dbl>   <dbl>   <dbl>
1     0  3.36   1429611375.    60   190    40   190 postprandial after 3pm Yes 3PM-7PM 4Hr   insight.A   0.692       0   0.308
# A tibble: 1 x 14
  index    id creation_date  minA  maxA  minB  maxB feature                timerange   hrs   name      TIR_pre TAB_pre TBR_pre
  <int> <dbl>         <dbl> <int> <int> <int> <int> <chr>                  <chr>       <chr> <chr>       <dbl>   <dbl>   <dbl>
1     0  83.9   1429865895.    60   190    40   190 postprandial after 3pm Yes 3PM-7PM 4Hr   insight.A   0.692       0   0.308
...

要得到单个数据集,可以用 rbind 把 list 中的各个元素合并起来:

# Run preprocess() on each row, then stack the one-row results into a
# single data set.
out <- do.call(
  rbind,
  lapply(
    seq_len(nrow(test)),
    function(row_idx) preprocess(test[row_idx, ])
  )
)

或者使用 dplyr 的 rowwise:

library(dplyr)
# Row-wise alternative: run preprocess() on each row, keep the one-row
# results in a list column, bind them together, and attach them to the
# original columns.
# NOTE(review): cur_data() is deprecated as of dplyr 1.1.0 in favour of
# pick(); it is kept here so the snippet also runs on older dplyr versions.
test %>%
  rowwise() %>%
  transmute(out = list(preprocess(cur_data()))) %>%
  pull(out) %>%
  bind_rows() %>%
  bind_cols(test, .)

apply 的问题在于:它会先把数据框转换为 matrix,而 matrix 只能有一种数据类型——只要其中有任何字符型元素,整个矩阵都会被转换为字符型。因此传给函数的每一行是字符向量,而不是单行数据框。


推荐阅读