首页 > 解决方案 > 如何按变量汇总保留名称

问题描述

这里的示例和不言自明的案例与所需的解决方案:

 set.seed(123)
 df <- data.frame(s=rnorm(100), g1=seq(1,100,,100),g2=seq(5,100,,100),g3=seq(10,1,,100))
 agg <- with(df, aggregate(s, data.frame(g1,g2,g3), 
                 FUN = function(x) c("mean" = mean(x), "median" = median(x))))
 head(agg,5)

       g1        g2       g3     x.mean   x.median
1 100 100.00000 1.000000 -1.0264209 -1.0264209
2  99  99.04040 1.090909 -0.2357004 -0.2357004
3  98  98.08081 1.181818  1.5326106  1.5326106
4  97  97.12121 1.272727  2.1873330  2.1873330
5  96  96.16162 1.363636 -0.6002596 -0.6002596

期望的结果(-xxx 只是计算的数值或按组计算的数值):

  g1   g2       g3        g1.x.mean  g1.x.median g2.x.mean  g2.x.median g3.x.mean  g3.x.median
1 100 100.00000 1.000000 -1.0264209 -1.0264209   -xxx       -xxxx        -xxxx       -xxxx
2  99  99.04040 1.090909 -0.2357004 -0.2357004   -xxx       -xxxx        -xxxx       -xxxx
3  98  98.08081 1.181818  1.5326106  1.5326106   -xxx       -xxxx        -xxxx       -xxxx
4  97  97.12121 1.272727  2.1873330  2.1873330   -xxx       -xxxx        -xxxx       -xxxx
5  96  96.16162 1.363636 -0.6002596 -0.6002596   -xxx       -xxxx        -xxxx       -xxxx

标签: r

解决方案


如果分组应该是分开的,我们可以使用lapply循环'g'列,aggregate分别应用并将它们与cbind

nm1 <- paste0('g', 1:3)
out1 <- do.call(cbind, lapply(nm1, function(g)       {
   out <- do.call(data.frame, aggregate(s ~ .,  df[c('s', g)],
        FUN = function(x) c(mean(x), median(x))))
    names(out)[-1] <- paste0(g, c('.x.mean', '.x.median'))
   out}))

 out1 <-  out1[c(nm1, setdiff(names(out1), nm1))]

-输出

head(out1)
#  g1       g2       g3   g1.x.mean g1.x.median   g2.x.mean g2.x.median  g3.x.mean g3.x.median
#1  1 5.000000 1.000000 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -1.0264209  -1.0264209
#2  2 5.959596 1.090909 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.2357004  -0.2357004
#3  3 6.919192 1.181818  1.55870831  1.55870831  1.55870831  1.55870831  1.5326106   1.5326106
#4  4 7.878788 1.272727  0.07050839  0.07050839  0.07050839  0.07050839  2.1873330   2.1873330
#5  5 8.838384 1.363636  0.12928774  0.12928774  0.12928774  0.12928774 -0.6002596  -0.6002596
#6  6 9.797980 1.454545  1.71506499  1.71506499  1.71506499  1.71506499  1.3606524   1.3606524

或者如 中所述tidyverse,如果每个组的unique组数不同,则一种方法是ave创建列然后应用unique

out1 <- unique(cbind(df[nm1], do.call(cbind, lapply(nm1, function(g) {
                 mean <- with(df, ave(s, df[[g]]))
                 median <- with(df, ave(s, df[[g]], FUN = median))
                 setNames(data.frame(mean, median),
                       paste0(g, c('.x.mean', '.x.median')))
                       
                       }))))

-输出

head(out1)
#  g1       g2        g3   g1.x.mean g1.x.median   g2.x.mean g2.x.median   g3.x.mean g3.x.median
#1  1 5.000000 10.000000 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -0.56047565
#2  2 5.959596  9.909091 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.23017749
#3  3 6.919192  9.818182  1.55870831  1.55870831  1.55870831  1.55870831  1.55870831  1.55870831
#4  4 7.878788  9.727273  0.07050839  0.07050839  0.07050839  0.07050839  0.07050839  0.07050839
#5  5 8.838384  9.636364  0.12928774  0.12928774  0.12928774  0.12928774  0.12928774  0.12928774
#6  6 9.797980  9.545455  1.71506499  1.71506499  1.71506499  1.71506499  1.71506499  1.71506499                           

或使用tidyerse

library(dplyr) # version >= 1.0
library(purrr)
library(stringr)
map_dfc(nm1, ~ df %>% 
      group_by(across(all_of(.x))) %>%
      summarise(!! str_c(.x, ".x.mean") := mean(s),
        !! str_c(.x, ".x.median") := median(s), .groups = 'drop')) %>% 
    select(all_of(nm1), everything())

-输出

# A tibble: 100 x 9
#      g1    g2    g3 g1.x.mean g1.x.median g2.x.mean g2.x.median g3.x.mean g3.x.median
#   <dbl> <dbl> <dbl>     <dbl>       <dbl>     <dbl>       <dbl>     <dbl>       <dbl>
# 1     1  5     1      -0.560      -0.560    -0.560      -0.560     -1.03       -1.03 
# 2     2  5.96  1.09   -0.230      -0.230    -0.230      -0.230     -0.236      -0.236
# 3     3  6.92  1.18    1.56        1.56      1.56        1.56       1.53        1.53 
# 4     4  7.88  1.27    0.0705      0.0705    0.0705      0.0705     2.19        2.19 
# 5     5  8.84  1.36    0.129       0.129     0.129       0.129     -0.600      -0.600
# 6     6  9.80  1.45    1.72        1.72      1.72        1.72       1.36        1.36 
# 7     7 10.8   1.55    0.461       0.461     0.461       0.461     -0.628      -0.628
# 8     8 11.7   1.64   -1.27       -1.27     -1.27       -1.27       0.239       0.239
# 9     9 12.7   1.73   -0.687      -0.687    -0.687      -0.687      0.548       0.548
#10    10 13.6   1.82   -0.446      -0.446    -0.446      -0.446      0.994       0.994
# … with 90 more rows

如果分组列具有不同数量的唯一元素,则可以选择mutate/transmute新列,然后distinct在最后

map_dfc(nm1, ~ df %>% 
      group_by(across(all_of(.x))) %>%
      transmute(!! str_c(.x, ".x.mean") := mean(s),
        !! str_c(.x, ".x.median") := median(s))) %>% 
    select(all_of(nm1), everything()) %>%
    ungroup %>%
    distinct

-输出

# A tibble: 100 x 9
#      g1    g2    g3 g1.x.mean g1.x.median g2.x.mean g2.x.median g3.x.mean g3.x.median
#   <dbl> <dbl> <dbl>     <dbl>       <dbl>     <dbl>       <dbl>     <dbl>       <dbl>
# 1     1  5    10      -0.560      -0.560    -0.560      -0.560    -0.560      -0.560 
# 2     2  5.96  9.91   -0.230      -0.230    -0.230      -0.230    -0.230      -0.230 
# 3     3  6.92  9.82    1.56        1.56      1.56        1.56      1.56        1.56  
# 4     4  7.88  9.73    0.0705      0.0705    0.0705      0.0705    0.0705      0.0705
# 5     5  8.84  9.64    0.129       0.129     0.129       0.129     0.129       0.129 
# 6     6  9.80  9.55    1.72        1.72      1.72        1.72      1.72        1.72  
# 7     7 10.8   9.45    0.461       0.461     0.461       0.461     0.461       0.461 
# 8     8 11.7   9.36   -1.27       -1.27     -1.27       -1.27     -1.27       -1.27  
# 9     9 12.7   9.27   -0.687      -0.687    -0.687      -0.687    -0.687      -0.687 
#10    10 13.6   9.18   -0.446      -0.446    -0.446      -0.446    -0.446      -0.446 
# … with 90 more rows

推荐阅读