r - 如何按两个字段分组并按日期计算每种类型的数据?
问题描述
我有一个数据集(前 100 行):
structure(list(department = structure(c(21L, 14L, 4L, 11L, 21L,
12L, 15L, 11L, 3L, 18L, 4L, 20L, 25L, 3L, 3L, 13L, 19L, 22L,
18L, 16L, 16L, 16L, 16L, 4L, 20L, 12L, 4L, 27L, 1L, 6L, 16L,
1L, 13L, 13L, 25L, 18L, 8L, 23L, 10L, 16L, 4L, 21L, 2L, 5L, 18L,
10L, 23L, 4L, 7L, 5L, 14L, 15L, 19L, 23L, 11L, 4L, 15L, 6L, 12L,
11L, 23L, 14L, 15L, 11L, 18L, 24L, 27L, 27L, 20L, 5L, 1L, 19L,
4L, 10L, 4L, 26L, 3L, 14L, 15L, 12L, 22L, 14L, 20L, 25L, 2L,
23L, 15L, 13L, 4L, 18L, 26L, 13L, 5L, 10L, 1L, 6L, 10L, 22L,
5L, 14L), .Label = c("Beauty", "Boutique advisor", "Boutique advisors",
"Boutique Stylist", "Clean Beauty Expert", "Conseiller en boutique",
"Design Consultant", "Designer Trade Specialist", "Food", "Furniture",
"In-store Design Expert", "In-store experts", "In-Store Sales Professional",
"In-Store Style Experts", "John Hardy", "Jos. A. Bank LIVE!",
"Levi's Stylists", "Lighting & Home Accessories", "Men's Wearhouse LIVE!",
"Menswear", "Personal advisors", "Styliste en boutique", "Vendeurs",
"Wine", "Women's Accessories", "Women's shoes", "Womenswear"), class = "factor"),
type = c("Completed", "Missed", "Missed", "Missed", "Missed",
"Missed", "Missed", "Completed", "Completed", "Missed", "Missed",
"Completed", "Completed", "Completed", "Completed", "Completed",
"Completed", "Completed", "Completed", "Missed", "Completed",
"Missed", "Completed", "Missed", "Missed", "Completed", "Missed",
"Missed", "Missed", "Completed", "Missed", "Completed", "Missed",
"Completed", "Missed", "Missed", "Completed", "Missed", "Missed",
"Completed", "Completed", "Missed", "Completed", "Missed",
"Completed", "Missed", "Missed", "Completed", "Missed", "Completed",
"Completed", "Missed", "Completed", "Missed", "Completed",
"Completed", "Missed", "Missed", "Missed", "Missed", "Completed",
"Missed", "Completed", "Completed", "Completed", "Missed",
"Missed", "Completed", "Missed", "Completed", "Completed",
"Missed", "Completed", "Completed", "Missed", "Missed", "Completed",
"Completed", "Completed", "Completed", "Missed", "Completed",
"Completed", "Completed", "Completed", "Completed", "Completed",
"Completed", "Completed", "Completed", "Completed", "Missed",
"Missed", "Completed", "Completed", "Completed", "Missed",
"Completed", "Missed", "Completed"), date = structure(c(17889,
17890, 17893, 17893, 17892, 17892, 17893, 17893, 17892, 17888,
17892, 17889, 17888, 17893, 17888, 17889, 17891, 17892, 17893,
17891, 17889, 17888, 17892, 17889, 17889, 17892, 17888, 17889,
17893, 17892, 17893, 17892, 17891, 17893, 17888, 17891, 17892,
17891, 17892, 17888, 17891, 17893, 17893, 17892, 17890, 17888,
17888, 17889, 17891, 17893, 17893, 17890, 17890, 17892, 17889,
17892, 17889, 17889, 17888, 17888, 17893, 17893, 17893, 17891,
17888, 17892, 17892, 17893, 17891, 17888, 17889, 17891, 17889,
17890, 17891, 17888, 17889, 17888, 17890, 17893, 17889, 17889,
17893, 17889, 17892, 17891, 17889, 17892, 17888, 17891, 17893,
17890, 17890, 17889, 17893, 17889, 17889, 17888, 17889, 17892
), class = "Date"), count = c(7L, 9L, 8L, 3L, 5L, 4L, 5L,
10L, 1L, 3L, 5L, 18L, 3L, 7L, 1L, 17L, 277L, 10L, 14L, 50L,
520L, 92L, 791L, 6L, 7L, 4L, 2L, 1L, 3L, 3L, 145L, 17L, 10L,
42L, 1L, 1L, 1L, 2L, 7L, 627L, 3L, 6L, 4L, 3L, 3L, 2L, 1L,
2L, 1L, 20L, 41L, 4L, 283L, 1L, 14L, 5L, 2L, 1L, 3L, 3L,
7L, 12L, 36L, 9L, 14L, 1L, 6L, 13L, 1L, 14L, 12L, 16L, 3L,
2L, 6L, 7L, 4L, 21L, 3L, 5L, 5L, 22L, 12L, 5L, 1L, 5L, 23L,
36L, 13L, 12L, 12L, 9L, 4L, 6L, 6L, 4L, 1L, 4L, 1L, 32L)), row.names = c(NA,
100L), class = "data.frame")
我需要它看起来像这样(按部门(行)和每天每种类型的相应计数(列)分组):
目前,我有两种方法来解决这个问题,这两种方法都不能产生预期的结果,但我怀疑我很接近,因为解决方案似乎介于两者之间。
第一种方法:
library(dplyr) # For the purpose of this reproducible example should you need it
dept %>%
group_by(
department
) %>%
summarise(
missed = sum(type == "Missed"),
completed = sum(type == "Completed"),
missed_pct = missed / (missed + completed)
)
这给了我这个:
# A tibble: 7 x 4
department missed completed missed_pct
<fct> <int> <int> <dbl>
1 Beauty 2 5 0.286
2 Food 0 1 0
3 Menswear 4 6 0.4
4 Wine 1 1 0.5
5 Women's Accessories 2 5 0.286
6 Women's shoes 3 5 0.375
7 Womenswear 4 5 0.444
第二种方法:
library(dplyr) # For the purpose of this reproducible example should you need it
dept %>%
group_by(
department,
date
) %>%
summarise(
missed = sum(type == "Missed"),
completed = sum(type == "Completed"),
missed_pct = missed / (missed + completed)
)
这给了我这个:
# A tibble: 28 x 5
# Groups: department [?]
department date missed completed missed_pct
<fct> <date> <int> <int> <dbl>
1 Beauty 2018-12-23 0 1 0
2 Beauty 2018-12-24 0 1 0
3 Beauty 2018-12-26 0 1 0
4 Beauty 2018-12-27 1 1 0.5
5 Beauty 2018-12-28 1 1 0.5
6 Food 2018-12-27 0 1 0
7 Menswear 2018-12-23 1 1 0.5
8 Menswear 2018-12-24 1 1 0.5
9 Menswear 2018-12-25 0 1 0
10 Menswear 2018-12-26 1 1 0.5
我怎样才能做到这一点?
解决方案
使用dplyr
:
library(dplyr)
dept %>%
group_by(department, date, type) %>%
summarise(count = sum(count, na.rm = T)) %>% # data had a few duplicate rows
spread(type, count, fill = 0) %>%
as.data.frame() %>%
group_by(department, date) %>%
mutate(missed_pct = Missed / (Missed + Completed)) %>%
melt(id.vars = c("department", "date")) %>%
spread(date, value) %>%
rename(type = variable)
让我知道您是否希望对此进行解释。基本上只是多次使用 spread 和 melt 来确保数据在每个阶段都按照我们想要的方式构建(我建议运行到每个 melt/spread 并注意 data.frame 的形状。)
推荐阅读
- powershell - 隔离数组中的相似文件名以进行额外处理
- php - Yii2 使用任何用户名和密码发送电子邮件
- python - python中sha256时随机十六进制字符串得到'none'
- python - 将列表插入到具有给定索引的二维数组中
- python - 如何从元组列表中删除重复项?
- javascript - 我无法使用 jspdf 为我的 pdf 渲染图像
- javascript - 组件事件上未调用 React.js 函数
- android - 如何通过放置在项目的 app 文件夹中来访问 google 凭据 json 文件以及如何将该路径传递给 GoogleCredentials
- python-3.x - 无法从 HTML BeautifulSoup 对象中提取 url
- c++ - 从特定键开始迭代地图