首页 > 解决方案 > 从全局环境中的对象获取 colnames(具有特定模式),然后返回新的

问题描述

所以我有一堆中间数据框对象,它们在我的全局环境中根据它们的顺序编号。即 IRIS1_St、IRIS2_Db、IRIS1_Start、IRIS2_FIXAR、IRIS4_Change、IRIS10_bananas

我已经弄清楚如何提取这些并返回行数和列数而不保留列表列(见下文),并且从技术上讲我已经提取了列名。但是我一辈子都无法弄清楚如何将这个 colname 列变成不是列表的东西,这样我就可以比较滞后值并返回一个更简单的列来显示新的内容。我尝试了 data.table()、data.frame()、as.character() 和 str_replace_all() 将其转换为向量。但似乎没有任何效果,这似乎是因为我不擅长使用列表!


library(dplyr)
library(purrr)
library(stringr)
IRIS1_St <- iris
IRIS2_Db <- IRIS1_St %>% 
  mutate(Petal.Length2 = Petal.Length*2)

IRIS3_Sum <- IRIS2_Db %>%  
  mutate(Sepal.sum = sum(Sepal.Length, Sepal.Width)) 


IRIS4_Change <- IRIS3_Sum %>% 
  mutate(SL.Change = Sepal.Length - lag(Sepal.Length)) %>%  filter(Petal.Length >=4)


IRIS10_bananas <- IRIS4_Change %>% mutate( bananas = case_when(Sepal.Length >6 ~ "BANANAS!!"))

Obj_Size <- grep("^IRIS",names(.GlobalEnv),value=TRUE) %>%
  na.omit() %>%
  mget(envir = globalenv()) %>%
  {OS <<-.} %>% 
  map_df(nrow) %>% 
  pivot_longer(1:max(ncol(.)), names_to = "Obj_name", values_to = "nrow") %>% 
  left_join(OS %>% 
              map_df(ncol) %>% 
              pivot_longer(1:max(ncol(.)), names_to = "Obj_name", values_to = "ncol")
  ) %>%  
  data.frame(OS %>% 
               lapply(colnames) %>% 
               data.table()) %>% 
  mutate(number = as.numeric(replace_na(str_extract(Obj_name,  "(?i)(?<=IRIS\\D{0,1})\\d+"), 0))) %>% 
  arrange(number, Obj_name) %>% 
  select(-number) %>% data.frame() %>% 
  rename(colnames = '.') 
   
#just to seperate out the colname extraction I've done so far
OST <- OS %>%  lapply(colnames) %>% data.table()

提取新内容我尝试了以下方法,但由于我已经列出了一个事实,所以它搞砸了。

 Obj_Size_New <- Obj_Size %>% 
   mutate(lag_col = as.character(lag(colnames)),
          new_col = setdiff(as.character(colnames), lag_col))

预期输出如下;

  Obj_name        nrow ncol   new_col
1       IRIS1_St  150    5    Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
2       IRIS2_Db  150    6    Petal.Length2
3      IRIS3_Sum  150    7    Sepal.sum
4   IRIS4_Change  89     8    SL.Change
5 IRIS10_bananas  89     9    bananas
                           

我使用下面 akrun 的建议重写了上述内容;

library(dplyr)
library(purrr)
library(stringr)
library(tibble)

Obj_Size <- grep("^IRIS",names(.GlobalEnv),value=TRUE) %>% #ID all objects in GE starting with "IRIS"
  na.omit() %>% 
  mget(envir = globalenv()) %>% #Use base R to get them
{OS <<-.} %>% #create intermediate object in GE to join to later
  map_df(nrow) %>% #Map nrow using purrr
  pivot_longer(1:max(ncol(.)), names_to = "Obj_name", values_to = "nrow") %>% #pivot so it's readable
  left_join(OS %>% #repeat with ncol and join back to dataset
              map_df(ncol) %>%  
              pivot_longer(1:max(ncol(.)), names_to = "Obj_name", values_to = "ncol")
  ) %>%  
left_join(OS %>% #repeat with colnames
               map(colnames) %>% 
              enframe() %>%  #create 2 col dataframe
              rename(Obj_name = name, 
                     colnames = value)) %>% 
  mutate(number = as.numeric(replace_na(str_extract(Obj_name,  "(?i)(?<=IRIS\\D{0,1})\\d+"), 0))) %>% #extract number after IRIS in object name so we can order correctly even when we get to 10 as when ordered by name with 10 it puts it after 1.
  arrange(number, Obj_name) %>%
  select(-number) %>% 
  data.frame() %>% 
  mutate (new_col = map2_chr(colnames, lag(colnames), ~toString(setdiff(.x, .y)))) #Id changes between colnames and only return anything new. 
   

标签: rstringdplyrpurrrstringr

解决方案


由于您有一个列表,colnames您可以使用以下map变体purrr

library(dplyr)
library(purrr)

Obj_Size %>%
  mutate(new_col = map2_chr(colnames, lag(colnames), ~toString(setdiff(.x, .y))))

如下new_col所示:

#                                                        new_col
#1 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
#2                                                 Petal.Length2
#3                                                     Sepal.sum
#4                                                     SL.Change
#5                                                       bananas

在基础 R 中,您可以使用mapply

Obj_Size$new_col <- mapply(function(x, y) toString(setdiff(x, y)), 
              Obj_Size$colnames, c(NA, Obj_Size$colnames[-nrow(Obj_Size)]))

推荐阅读