首页 > 解决方案 > 迭代 group_by()、summarise() 和基于 df 名称的唯一输出名称的最佳方法是什么?

问题描述

我正在尝试学习如何在不使用循环的情况下进行迭代,并且apply()软件包map()系列和软件包系列都让我感到非常困惑。

下面是一些示例代码,它们是我正在使用的数据的通用版本。我从一个跨越数月或数年的大型数据集开始。我必须将它过滤到感兴趣的特定时间段,然后我将单独的汇总表吐出到 .xlsx 的每一页,以对报告进行比较。

这段代码的真实版本工作得非常好,但是我要创建的汇总/比较表比这个示例要多得多,所以我想找出在功能上做到这一点的最佳方法。我已经使用 r 大约一年半了,并且可以做很多事情,但是让代码更高效仍然有点超出我的想象。

此外,我不太清楚如何迭代工作表的输出名称......或者我想运行多个迭代以基于 df 名称输出唯一文件名的任何情况。

这是我在这里的第一篇文章,所以我提前为任何格式/协议问题道歉。

library(tidyverse)
library(openxlsx)
library(janitor)
library(tibble)
library(tidyr)
library(dplyr)
library(lubridate)


rm(list=ls())

data <- tribble(
  ~Category, ~Date,        ~Area,     ~Count,
  "Fight",   "2021-01-04", "Area 1",  2,
  "Hug",     "2021-02-01", "Area 2",  4,
  "Dance",   "2021-03-21", "Area 3",  6,
  "Sleep",   "2021-04-18", "Area 3",  8,
  "Hug",     "2021-05-06", "Area 2",  2,
  "Dance",   "2021-06-30", "Area 1",  4,
  "Sleep",   "2021-01-29", "Area 1",  6,
  "Fight",   "2021-02-28", "Area 2",  8,
  "Dance",   "2021-03-15", "Area 3",  2,
  "Sleep",   "2021-04-08", "Area 3",  4,
  "Fight",   "2021-05-30", "Area 2",  6,
  "Hug",     "2021-06-10", "Area 1",  8
  
)

# define and order categorical variables

areas <- c("Area 1", "Area 2","Area 3")
cats <- c("Fight", "Hug", "Dance","Sleep")
data$Area <- factor(data$Area, levels = areas, ordered = TRUE)
data$Category <- factor(data$Category, levels = cats, ordered = TRUE)

# Filter and Summarise
this_14 <- filter(data, ymd(data$Date) >= ymd("2021-06-30")-13 & ymd(data$Date) <= ymd("2021-06-30")) %>%
  group_by(Category,Area,.drop = FALSE) %>%
  summarise(total = sum(Count)) %>%
  pivot_wider(names_from = `Area`,values_from = total) %>%
  adorn_totals(where=c("row","col"))
this_14[is.na(this_14)] <- 0

last_14 <- filter(data, ymd(data$Date) >= ymd("2021-06-30")-27 & ymd(data$Date) <= ymd("2021-06-30")-14) %>%
  group_by(Category,Area,.drop = FALSE) %>%
  summarise(total = sum(Count)) %>%
  pivot_wider(names_from = `Area`,values_from = total) %>%
  adorn_totals(where=c("row","col"))
last_14[is.na(last_14)] <- 0

prev_14 <- filter(data, ymd(data$Date) >= ymd("2021-06-30")-41 & ymd(data$Date) <= ymd("2021-06-30")-28) %>%
  group_by(Category,Area,.drop = FALSE) %>%
  summarise(total = sum(Count)) %>%
  pivot_wider(names_from = `Area`,values_from = total) %>%
  adorn_totals(where=c("row","col"))
prev_14[is.na(prev_14)] <- 0  

# Create Workbook
file <- paste("Dev/output_tables_eff.xlsx", sep="/")
wb <- createWorkbook()

# Define Sheets
sheet1 <- addWorksheet(wb, "this_14")
sheet2 <- addWorksheet(wb, "last_14")
sheet3 <- addWorksheet(wb, "prev_14")


# Set Dataframes to export
writeData(wb, sheet = sheet1,this_14)
writeData(wb, sheet = sheet2,last_14)
writeData(wb, sheet = sheet3,prev_14)

saveWorkbook(wb, file = file, overwrite = TRUE)

标签: r

解决方案


这是一个使用函数创建汇总表并将它们作为选项卡保存到工作簿的选项。我使用purrr::map2并提供了 Excel 工作表的开始日期列表(例如 0、-14、-28)和名称列表。

library(tidyverse)
library(openxlsx)
library(janitor)
library(lubridate)

create_table <- function(start_date, sheet_name, df, wb) {
  
  df_out <- df %>%
    mutate(Date = ymd(Date)) %>%
    filter(Date >= ymd("2021-06-30") + (start_date - 13) & Date <= ymd("2021-06-30") + start_date) %>%
    group_by(Category, Area) %>%
    summarise(total = sum(Count)) %>%
    pivot_wider(names_from = `Area`, values_from = total) %>%
    adorn_totals(where = c("row", "col")) %>%
    replace(is.na(.), 0)
  
  
  addWorksheet(wb, sheet_name)
  writeData(wb, sheet = sheet_name, df_out)

} 

wb <- createWorkbook()

map2(c(0, -14, -28), # desired start dates
     c('this_14', 'last_14', 'prev_14'), # names for the Excel tabs
     create_table, 
     df = data, 
     wb = wb)

saveWorkbook(wb, file = "output_tables_eff.xlsx", overwrite = TRUE)

数据

data <- tribble(
  ~Category, ~Date,        ~Area,     ~Count,
  "Fight",   "2021-01-04", "Area 1",  2,
  "Hug",     "2021-02-01", "Area 2",  4,
  "Dance",   "2021-03-21", "Area 3",  6,
  "Sleep",   "2021-04-18", "Area 3",  8,
  "Hug",     "2021-05-06", "Area 2",  2,
  "Dance",   "2021-06-30", "Area 1",  4,
  "Sleep",   "2021-01-29", "Area 1",  6,
  "Fight",   "2021-02-28", "Area 2",  8,
  "Dance",   "2021-03-15", "Area 3",  2,
  "Sleep",   "2021-04-08", "Area 3",  4,
  "Fight",   "2021-05-30", "Area 2",  6,
  "Hug",     "2021-06-10", "Area 1",  8
  
)

# define and order categorical variables

areas <- c("Area 1", "Area 2","Area 3")
cats <- c("Fight", "Hug", "Dance","Sleep")
data$Area <- factor(data$Area, levels = areas, ordered = TRUE)
data$Category <- factor(data$Category, levels = cats, ordered = TRUE)

推荐阅读