首页 > 解决方案 > 在 R 中提取字符串里运算符的左侧和右侧

问题描述

我有一个数据框,其中包含来自随机森林模型的公式(先用 getTree 提取各棵树,再通过自定义函数得到这些公式)。

示例输入:

# Negated membership test: TRUE for each element of `x` that is not in `y`.
`%!in%` <- function(x, y) {
  !(x %in% y)
}

# Sample input: one dplyr filter expression (stored as text) per cluster.
# stringsAsFactors = FALSE keeps both columns as character on R < 4.0, where
# data.frame() would otherwise convert them to factors.
example <- data.frame(formulas = c("train %>% filter( balance > 599.5 & balance < 605.5 & 
                                                      contact %!in% c( 'cellular', 'telephone' ) & balance > 566.5 & 
                                                      job_group1 %!in% c( 'services' ) & 
                                                      education %in% c( 'primary' ) & month_group1 %in% c( 'jun' ) & 
                                                      education %!in% c( 'tertiary' ) & 
                                                      job_group1 %!in% c( 'blue-collar' ) )",
                                   "train %>% filter( day > 9.5 & 
                                                      month_group1 %!in% c( 'apr', 'feb', 'jan', 'nov', 'oct' ) & 
                                                      balance > 443.5 & balance < 597 & balance < 650.5 & 
                                                      job_group1 %!in% c( 'admin.', 'blue-collar', 'management', 'retired' ) & 
                                                      month_group1 %!in% c( 'jul' ) & 
                                                      contact %in% c( 'cellular', 'telephone' ) &
                                                      month_group1 %!in% c( 'aug' ) & housing %in% c( 'no' ) & 
                                                      education %!in% c( 'secondary' ) & 
                                                      education %!in% c( 'tertiary' ) &
                                                      month_group1 %!in% c( 'jun', 'may' ) )" ),
                      cluster = c("1","2"),
                      stringsAsFactors = FALSE )

我希望从每个条件中提取运算符的左侧、右侧、运算符本身以及所属的 cluster,并将结果保存到一个新的数据框中。

预期输出

# Expected result: one row per filter condition.
# RHS mixes numeric thresholds and category lists, so every value is written
# as a character string (this is what c() would coerce them to anyway).
# Multi-value lists use a plain comma separator with no spaces or line breaks.
desired <- data.frame(
  LHS = c("balance", "balance", "contact", "balance", "job_group1",
          "education", "month_group1", "education", "job_group1",
          "day", "month_group1", "balance", "balance", "balance",
          "job_group1", "month_group1", "contact", "month_group1", "housing",
          "education", "education", "month_group1"),
  RHS = c("599.5", "605.5", "cellular,telephone", "566.5", "services",
          "primary", "jun", "tertiary", "blue-collar",
          "9.5", "apr,feb,jan,nov,oct", "443.5", "597", "650.5",
          "admin.,blue-collar,management,retired", "jul", "cellular,telephone",
          "aug", "no", "secondary", "tertiary", "jun,may"),
  Operator = c(">", "<", "%!in%", ">", "%!in%", "%in%", "%in%",
               "%!in%", "%!in%", ">", "%!in%", ">", "<", "<", "%!in%",
               "%!in%", "%in%", "%!in%", "%in%", "%!in%", "%!in%", "%!in%"),
  cluster = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
  # Keep text columns as character on R < 4.0 as well.
  stringsAsFactors = FALSE
)

我正在使用 R 版本 3.3.3,因为项目需要它。

标签: r

解决方案


你可以使用

library(dplyr)
library(tidyr)
library(stringr)

# Strip the `train %>% filter( ... )` wrapper, split each formula on "&" so
# every condition gets its own row, then pull the text before the operator,
# the text after it, and the operator itself out of each condition.
# (Pipe `example` through tibble() first if tibble printing is wanted.)
example %>%
  mutate(formulas = str_remove_all(formulas, "^train %>% filter\\( |\\s*\\)$")) %>%
  mutate(formulas = str_split(formulas, "&")) %>%
  unnest(formulas) %>%
  mutate(
    LHS = str_trim(str_extract(formulas, ".*(?=(<|>|%in%|%!in%))")),
    RHS = str_trim(str_extract(formulas, "(?<=(<|>|%in%|%!in%)).*")),
    operator = str_trim(str_extract(formulas, "<|>|%in%|%!in%"))
  ) %>%
  select(-formulas)

返回

# A tibble: 22 x 4
   cluster LHS          RHS                                                   operator
   <chr>   <chr>        <chr>                                                 <chr>   
 1 1       balance      599.5                                                 >       
 2 1       balance      605.5                                                 <       
 3 1       contact      c( 'cellular', 'telephone' )                          %!in%   
 4 1       balance      566.5                                                 >       
 5 1       job_group1   c( 'services' )                                       %!in%   
 6 1       education    c( 'primary' )                                        %in%    
 7 1       month_group1 c( 'jun' )                                            %in%    
 8 1       education    c( 'tertiary' )                                       %!in%   
 9 1       job_group1   c( 'blue-collar' )                                    %!in%   
10 2       day          9.5                                                   >       
11 2       month_group1 c( 'apr', 'feb', 'jan', 'nov', 'oct' )                %!in%   
12 2       balance      443.5                                                 >       
13 2       balance      597                                                   <       
14 2       balance      650.5                                                 <       
15 2       job_group1   c( 'admin.', 'blue-collar', 'management', 'retired' ) %!in%   
16 2       month_group1 c( 'jul' )                                            %!in%   
17 2       contact      c( 'cellular', 'telephone' )                          %in%    
18 2       month_group1 c( 'aug' )                                            %!in%   
19 2       housing      c( 'no' )                                             %in%    
20 2       education    c( 'secondary' )                                      %!in%   
21 2       education    c( 'tertiary' )                                       %!in%   
22 2       month_group1 c( 'jun', 'may' )                                     %!in%  

我尝试去掉了对 stringr 的依赖以及 across() 函数,改用基础 R 的字符串函数。也许这个版本对你有用。

# Same extraction using base-R regex helpers instead of stringr and across().
# Assumes every "&"-separated condition contains exactly one operator, so a
# single sub() per column is enough; trimws() then removes the padding and
# line breaks left around each piece.
example %>% 
  mutate(formulas = gsub("^train %>% filter\\( |\\s*\\)$", "", formulas),
         formulas = strsplit(formulas, "&")) %>% 
  unnest(formulas) %>% 
  mutate(
    # Keep only what precedes the operator.
    LHS = trimws(sub("(.*?)(<|>|%in%|%!in%).*", "\\1", formulas, perl = TRUE)),
    # Delete everything up to and including the operator; the rest is the RHS.
    RHS = trimws(sub(".*(<|>|%in%|%!in%)", "", formulas, perl = TRUE)),
    # Keep only the operator itself.
    operator = trimws(sub(".*(<|>|%in%|%!in%).*", "\\1", formulas, perl = TRUE))
  ) %>% 
  select(-formulas)

推荐阅读