首页 > 解决方案 > 如何重新排列 dplyr 的输出

问题描述

当我写下面的代码

   # Mean milk yield for every (dim_cat, lact_cat) combination. The result has
   # one row per combination, so each dim_cat period appears once per lact_cat.
   ddply(milkers, .(dim_cat, lact_cat), function(x) mean(x$milkyield))

我得到以下输出

每个时间段 2 行的初步输出

按牲畜类别(lact_cat,1 对 2)计算的平均产奶量是正确的。我想最终得到一张更像下面这样的表格。

每个 dim_cat(泌乳天数)时间段一行的所需表格格式

实际上,我正在尝试获取每个时间段内的动物数量并计算它们的平均产奶量。问题是它正在计算所有时间段的动物总数和所有时间段的平均产奶量。

我用来生成此数据的代码如下。

# Split the full data set by lact_cat (level 1 is reported as Heifers and
# level 2 as Cows in the summary call below).
heiferdat <- subset(milkers, lact_cat== 1)
cowdat <- subset(milkers, lact_cat== 2)


# NOTE: the anonymous function never uses its argument `x` (the rows of the
# current dim_cat group). It reads the global `milkers`, `heiferdat` and
# `cowdat` objects instead, so every dim_cat row receives the same overall
# counts and means rather than per-period values.
ddply(milkers, .(dim_cat), function(x) c(Heifers = sum(milkers$lact_cat==1), H_Milk= mean(heiferdat$milkyield), Cows = sum(milkers$lact_cat==2), C_Milk= mean(cowdat$milkyield)))

我曾预料到,在这段代码中,.(dim_cat) 变量将应用于函数,以限制 sum 和 mean 函数只包括正确时间段内的动物。

我正在寻找有关如何获得每个时间段一行的输出以及每个 lact_cat 类的动物数量和每个 lact_cat 的平均产奶量的建议

谢谢

以下是我正在使用的数据的子集。

# Dump reproducible samples of the three data frames: rows 180-200 of milkers
# and the first 20 rows of each subset, restricted to columns 11, 25 and 26
# (milkyield, dim_cat and lact_cat in the output shown below).
dput(milkers[180:200, c(11, 25, 26)]) 
dput(heiferdat[1:20, c(11, 25, 26)])
dput(cowdat[1:20, c(11, 25, 26)])

> dput(milkers[180:200, c(11, 25, 26)]) 
structure(list(milkyield = structure(c(8.42, 38.32, 14.27, 7.68, 
16.59, 17.19, 24.45, 33.47, 36.16, 25.88, 11.61, 18.96, 11.27, 
33.6, 21.57, 20.87, 9.62, 7.93, 21.02, 17.75, 22.01), label = "Milk (L)", class = c("labelled", 
"numeric")), dim_cat = structure(c(5L, 3L, 7L, 7L, 2L, 7L, 2L, 
2L, 2L, 3L, 6L, 6L, 2L, 3L, 6L, 6L, 6L, 6L, 6L, 7L, 6L), .Label = c("<31", 
"31-90", "91-150", "151-210", "211-270", "271-330", ">330"), class = c("labelled", 
"factor"), label = "Days in Milk"), lact_cat = structure(c(2L, 
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor")), row.names = 180:200, class = "data.frame")

> dput(heiferdat[1:20, c(11, 25, 26)]) 
structure(list(milkyield = structure(c(14.27, 17.19, 11.61, 18.96, 
11.27, 21.57, 20.87, 9.62, 7.93, 21.02, 17.75, 22.01, 25.15, 
11.75, 12.6, 15.62, 19.29, 8.85, 15.52, 11.62), label = "Milk (L)", class = c("labelled", 
"numeric")), dim_cat = structure(c(7L, 7L, 6L, 6L, 2L, 6L, 6L, 
6L, 6L, 6L, 7L, 6L, 6L, 6L, 6L, 7L, 6L, 6L, 6L, 6L), .Label = c("<31", 
"31-90", "91-150", "151-210", "211-270", "271-330", ">330"), class = c("labelled", 
"factor"), label = "Days in Milk"), lact_cat = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("1", "2"), class = "factor")), row.names = c(182L, 
185L, 190L, 191L, 192L, 194L, 195L, 196L, 197L, 198L, 199L, 200L, 
201L, 202L, 203L, 204L, 205L, 206L, 207L, 208L), class = "data.frame")

> dput(cowdat[1:20, c(11, 25, 26)]) 
structure(list(milkyield = structure(c(15.73, 14.56, 16.94, 16.25, 
39.09, 9.79, 8.41, 3.05, 38.89, 11.7, 29.89, 19.73, 18.2, 20.63, 
20.32, 52.99, 10.11, 8.08, 10.84, 33.75), label = "Milk (L)", class = c("labelled", 
"numeric")), dim_cat = structure(c(3L, 6L, 6L, 2L, 3L, 7L, 6L, 
7L, 3L, 7L, 3L, 6L, 3L, 6L, 2L, 2L, 7L, 6L, 7L, 7L), .Label = c("<31", 
"31-90", "91-150", "151-210", "211-270", "271-330", ">330"), class = c("labelled", 
"factor"), label = "Days in Milk"), lact_cat = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("1", "2"), class = "factor")), row.names = c(NA, 
20L), class = "data.frame")

标签: r dplyr mean

解决方案


遵循@DanChaltiel 使用 dplyr 的建议。这是一个 dplyr 方法:

library(dplyr)

# Per-group summary: mean milk yield and number of animals for every
# (dim_cat, lact_cat) combination present in the data.
all_summary <- milkers %>%
  group_by(dim_cat, lact_cat) %>%
  summarise(avg = mean(milkyield),
            num = n()) %>%
  # summarise() only peels off the last grouping level, so the result would
  # still be grouped by dim_cat; ungroup so later verbs see a plain table.
  ungroup()

此时,您已计算出所有摘要信息。以下代码只是用于调整格式/展示。

# Reshape the long summary into one row per dim_cat, with separate count and
# mean columns for heifers (lact_cat "1") and cows (lact_cat "2").
# lact_cat is a factor, so it is compared against its level labels as strings.
heifer_summary <- all_summary %>%
  filter(lact_cat == "1") %>%
  select(dim_cat, Heifers = num, H_Milk = avg)
cow_summary <- all_summary %>%
  filter(lact_cat == "2") %>%
  select(dim_cat, Cows = num, C_Milk = avg)

arranged_summary <- full_join(heifer_summary, cow_summary, by = "dim_cat") %>%
  # Drop any grouping inherited from all_summary so the result is a plain table.
  ungroup() %>%
  # A period with animals in only one class has no matching row on the other
  # side of the join; report that as a count of 0 rather than NA. The mean
  # stays NA because there is nothing to average.
  mutate(Heifers = coalesce(Heifers, 0L),
         Cows = coalesce(Cows, 0L)) %>%
  select(dim_cat, Heifers, H_Milk, Cows, C_Milk) %>%
  arrange(dim_cat)

推荐阅读