首页 > 解决方案 > 解析每一行中的一个字符串,并将结果展开为一个整洁的数据框

问题描述

我有一个像这样的数据框:

# A tibble: 2 x 3
     id text_1                                              text_2                                                          
  <int> <chr>                                               <chr>                                                           
1     1 "{1=>{:name=>\"aaa\", :priority=>0, :count=>4}, 7=… "{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ddd\", :…
2     2 "{1=>{:name=>\"aaa\", :priority=>0, :count=>5}, 3=… "{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ccc\", :…

可重现:

structure(list(id = 1:2, text_1 = c("{1=>{:name=>\"aaa\", :priority=>0, :count=>4}, 7=>{:name=>\"bbb\", :priority=>0, :count=>2}}", 
"{1=>{:name=>\"aaa\", :priority=>0, :count=>5}, 3=>{:name=>\"ccc\", :priority=>0, :count=>3}}"
), text_2 = c("{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ddd\", :priority=>0, :count=>2}", 
"{:name=>\"bbb\", :priority=>0,  :count=>4}, {:name=>\"ccc\", :priority=>0, :count=>2}, {:name=>\"ddd\", :priority=>0, :count=>9}"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
), spec = structure(list(cols = list(id = structure(list(), class = c("collector_integer", 
"collector")), text_1 = structure(list(), class = c("collector_character", 
"collector")), text_2 = structure(list(), class = c("collector_character", 
"collector"))), default = structure(list(), class = c("collector_guess", 
"collector"))), class = "col_spec"))

其中每一行都包含一个要解析的字符串。

第一列包含带有标识符的字段,第二列中的字段正好在一组括号之间。

我想展开它以达到这个结果,以将在该id级别上不常见的产品设置为零的方式。

产品数量有限,因此需要填写所有组合。

# A tibble: 14 x 5
      id product priority count level 
   <int> <chr>      <int> <int> <chr> 
 1     1 aaa            0     4 text_1
 2     1 bbb            0     4 text_1
 3     1 ccc            0     0 text_1
 4     2 aaa            0     5 text_1
 5     2 bbb            0     0 text_1
 6     2 ccc            0     3 text_1
 7     1 aaa            0     0 text_2
 8     1 bbb            0     4 text_2
 9     1 ccc            0     0 text_2
10     1 ddd            0     2 text_2
11     2 aaa            0     0 text_2
12     2 bbb            0     4 text_2
13     2 ccc            0     2 text_2
14     2 ddd            0     9 text_2

我想我必须使用某种组合,extract但我迷路了。

标签: rregexdplyrtidyversetidyr

解决方案


如果它最初是 JSON,正如@neilfws 所建议的那样 - 使用包之类的东西会更容易解析jsonlite。如果没有,请尝试以下操作:

library(tidyr)
library(dplyr)
library(stringr)

df %>%
  gather(level, 'val', text_1, text_2) %>%
  separate(val, into = paste0('val', 1:(max(str_count(.$val,"\\},"))+1)), "\\},") %>%
  gather(val, ugly_text, starts_with('val')) %>%
  select(-val) %>%
  filter(!is.na(ugly_text)) %>%
  mutate(product = str_match(ugly_text, ':name=>\\"(.*?)\\"')[, 2],
         priority = str_match(ugly_text , ':priority=>([0-9])')[, 2],
         count = str_match(ugly_text , ':count=>([0-9])')[, 2]) %>%
  select(id, product, priority, count, level) %>%
  full_join(distinct(expand.grid(id = .$id, product = .$product, level = .$level)),
            by = c('id', 'product', 'level')) %>%
  mutate_at(vars(priority, count), ~if_else(is.na(.x), 0, as.numeric(.x))) %>%
  arrange(level, id, product)

# A tibble: 16 x 5
      id product priority count level 
   <int> <chr>      <dbl> <dbl> <chr> 
 1     1 aaa            0     4 text_1
 2     1 bbb            0     2 text_1
 3     1 ccc            0     0 text_1
 4     1 ddd            0     0 text_1
 5     2 aaa            0     5 text_1
 6     2 bbb            0     0 text_1
 7     2 ccc            0     3 text_1
 8     2 ddd            0     0 text_1
 9     1 aaa            0     0 text_2
10     1 bbb            0     4 text_2
11     1 ccc            0     0 text_2
12     1 ddd            0     2 text_2
13     2 aaa            0     0 text_2
14     2 bbb            0     4 text_2
15     2 ccc            0     2 text_2
16     2 ddd            0     9 text_2

我不确定您在写“产品数量有限,因此需要用所有组合填充”时的意思-但这至少应该是一个开始。


推荐阅读