首页 > 解决方案 > 显示组的打印方法

问题描述

使用 dplyr,可以轻松查看数据集是按哪些变量分组的。在 data.table 中如何获取这些分组,或者如何定义一个能显示它们的打印方法?例子:

# data.table supplies setDT() and the DT[i, j, by] syntax used below.
library(data.table)

df <- read.table(text = "X  Y Goal
32 12 1
                 13 42 0
                 55 33 0", header = TRUE)
# Convert the data.frame into a data.table in place.
setDT(df)
# Sum every non-grouping column (.SD) within each (X, Y) group. The result
# prints automatically at top level, so no trailing `[]` is needed.
df[, lapply(.SD, sum), by = .(X, Y)]

输出:

  X  Y Goal
1: 32 12    1
2: 13 42    0
3: 55 33    0

在 dplyr 中做类似的事情可以更直观地显示组,我怎样才能在 data.table 中看到相同的内容?

更新:确实正如 @Gregor 在下面指出的那样,dplyr 的 summaris(z)e 系列函数会自动取消分组。但是,我怎样才能让 data.table 打印出如下所示的内容?

dplyr 输出示例。我正在寻找的,正是 `# Groups: Species [3]` 这一行在 data.table 中的替代方案。

 library(dplyr)
 # Group iris by Species; the grouped tibble prints a "# Groups:" header.
 group_by(iris, Species)

    # A tibble: 150 x 5
    # Groups:   Species [3]
   #    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
   #           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
   #  1          5.1         3.5          1.4         0.2 setosa 
   #  2          4.9         3            1.4         0.2 setosa 
   #  3          4.7         3.2          1.3         0.2 setosa 
   #  4          4.6         3.1          1.5         0.2 setosa 

谢谢!

标签: r, dplyr, data.table

解决方案


实际上,如果目标正如标题所述,并且 OP 的具体需求如下,可以考虑使用基础 R 的 by(tapply 的面向对象包装器)并配合 identity:

显示组的打印方法

...人们可以轻松查看数据集所依据的组

事实上,您可以运行任何操作(例如 summary),by 的输出仍然带有分组标题和分隔线!

# Generic templates: `df` stands for any data frame that holds the grouping
# columns. The data and the indices must come from the same frame so that
# their lengths match.

# ONE GROUP
by(df, df$group, identity)

# TWO GROUPS
with(df, by(df, list(group1, group2), identity))

# THREE GROUPS
with(df, by(df, list(group1, group2, group3), identity))

以下是随机种子数据的演示:

随机数据

# Seeded 50-row demo frame with one column per common column type.
set.seed(3222018)
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")

random_df <- local({
  # Row count kept local so no extra object lands in the calling environment.
  n_rows <- 50L

  # Columns are listed in the original order so the seeded random stream is
  # consumed in the same sequence.
  data.frame(
    group = sample(data_tools, n_rows, replace = TRUE),
    int = sample(seq_len(5L), n_rows, replace = TRUE),
    num = rnorm(n_rows),
    char = rep_len(letters[1:3], n_rows),
    bool = sample(c(TRUE, FALSE), n_rows, replace = TRUE),
    # NOTE(review): the upper bound depends on today's date, so this column
    # (and every draw after it) is not stable across days despite the seed.
    data = as.Date(
      sample(seq_len(as.integer(Sys.Date())), n_rows, replace = TRUE),
      origin = "1970-01-01"
    )
  )
})

Rextester 演示

一组

# Print random_df split by `group`; identity() returns each subset unchanged.
by(data = random_df, INDICES = random_df$group, FUN = identity)

# random_df$group: julia
#    group int         num char  bool       data
# 9  julia   1  0.55209060    c  TRUE 2003-01-08
# 12 julia   5  0.25581573    c FALSE 1973-01-20
# 20 julia   2 -0.44872825    b FALSE 1981-12-20
# 30 julia   1  0.09790121    c  TRUE 1981-07-11
# 31 julia   1  1.75711081    a FALSE 1984-05-23
# 35 julia   3 -1.78797757    b  TRUE 1992-10-23
# 39 julia   5 -1.83548253    c  TRUE 1982-04-18
# 44 julia   3  0.52640304    b FALSE 2016-07-28
# 50 julia   2  0.05759068    b FALSE 1987-07-09
# ------------------------------------------------------------ 
#   random_df$group: python
#     group int         num char  bool       data
# 8  python   3 -0.75471461    b  TRUE 2018-04-08
# 15 python   4  0.03756282    c FALSE 2006-03-29
# 19 python   1  0.43025626    a FALSE 2005-03-30
# 22 python   4 -0.28019319    a FALSE 1997-02-19
# 24 python   2 -0.05318753    c FALSE 1977-08-02
# 45 python   2 -0.11575270    c  TRUE 2006-06-23
# 48 python   2 -0.29651827    c  TRUE 2015-07-13
# 49 python   3 -0.45816745    a FALSE 1991-09-09
# ------------------------------------------------------------ 
#   random_df$group: r
#    group int        num char  bool       data
# 5      r   4 -0.1748250    b FALSE 2005-04-20
# 14     r   2 -0.5868782    b  TRUE 2005-12-30
# 25     r   5 -0.5108906    a  TRUE 1985-08-26
# 33     r   1  0.3580581    c FALSE 1983-08-16
# 34     r   3 -2.9556022    a  TRUE 1998-07-24
# 36     r   5 -2.5008645    c  TRUE 1990-12-30
# 43     r   5 -2.1710319    a  TRUE 1987-09-29
# ------------------------------------------------------------ 
#   ...

两组

# Split by every (group, char) combination; with() resolves the column names.
with(
  random_df,
  by(data = random_df, INDICES = list(group, char), FUN = identity)
)

# : julia
# : a
#    group int      num char  bool       data
# 31 julia   1 1.757111    a FALSE 1984-05-23
# ------------------------------------------------------------ 
# : python
# : a
#     group int        num char  bool       data
# 19 python   1  0.4302563    a FALSE 2005-03-30
# 22 python   4 -0.2801932    a FALSE 1997-02-19
# 49 python   3 -0.4581675    a FALSE 1991-09-09
# ------------------------------------------------------------ 
# : r
# : a
#    group int        num char bool       data
# 25     r   5 -0.5108906    a TRUE 1985-08-26
# 34     r   3 -2.9556022    a TRUE 1998-07-24
# 43     r   5 -2.1710319    a TRUE 1987-09-29
# ------------------------------------------------------------ 
# : sas
# : a
#    group int        num char  bool       data
# 28   sas   2 -0.6086176    a FALSE 2019-01-26
# 40   sas   4  0.5408948    a FALSE 2004-10-24
# 46   sas   1  0.7266734    a  TRUE 1996-09-05
# ------------------------------------------------------------ 
# : spss
# : a
#    group int         num char  bool       data
# 1   spss   1 -0.06494476    a  TRUE 1999-07-10
# 7   spss   5  1.93460991    a  TRUE 1974-04-17
# 16  spss   1 -0.55380012    a FALSE 2010-04-06
# 37  spss   1  0.17309091    a  TRUE 1999-01-22
# ------------------------------------------------------------ 
# : stata
# : a
#    group int        num char bool       data
# 4  stata   1 -1.0498222    a TRUE 1981-05-01
# 10 stata   1  1.9223104    a TRUE 1996-07-23
# 13 stata   1 -0.8381546    a TRUE 1989-01-12
# ------------------------------------------------------------ 
# ...

三组

# Split by every (group, char, int) combination; empty cells print as NULL.
with(
  random_df,
  by(data = random_df, INDICES = list(group, char, int), FUN = identity)
)

# : julia
# : a
# : 1
#    group int      num char  bool       data
# 31 julia   1 1.757111    a FALSE 1984-05-23
# ------------------------------------------------------------ 
# : python
# : a
# : 1
#     group int       num char  bool       data
# 19 python   1 0.4302563    a FALSE 2005-03-30
# ------------------------------------------------------------ 
# : r
# : a
# : 1
# NULL
# ------------------------------------------------------------ 
# : sas
# : a
# : 1
#    group int       num char bool       data
# 46   sas   1 0.7266734    a TRUE 1996-09-05
# ------------------------------------------------------------ 
# : spss
# : a
# : 1
#    group int         num char  bool       data
# 1   spss   1 -0.06494476    a  TRUE 1999-07-10
# 16  spss   1 -0.55380012    a FALSE 2010-04-06
# 37  spss   1  0.17309091    a  TRUE 1999-01-22
# ------------------------------------------------------------ 
# : stata
# : a
# : 1
#    group int        num char bool       data
# 4  stata   1 -1.0498222    a TRUE 1981-05-01
# 10 stata   1  1.9223104    a TRUE 1996-07-23
# 13 stata   1 -0.8381546    a TRUE 1989-01-12
# ------------------------------------------------------------ 
# ...

数据表示例

为了说明,您还可以在 by 内部运行任何操作(包括 data.table 聚合),打印输出仍然会带有分组标题和分隔线:

library(data.table)
...

# For each `group` subset, summarise `num` per `char` with data.table.
# by() still prints a group header and a dashed separator around each result.
by(
  data = random_df,
  INDICES = random_df$group,
  FUN = function(chunk) {
    data.table(chunk)[
      ,
      .(
        mean_num = mean(num),
        median_num = median(num),
        min_num = min(num),
        max_num = max(num),
        total_num = sum(num)
      ),
      by = char
    ]
  }
)

# random_df$group: julia
#    char   mean_num median_num   min_num   max_num total_num
# 1:    c -0.2324187  0.1768585 -1.835483 0.5520906 -0.929675
# 2:    b -0.4131780 -0.1955688 -1.787978 0.5264030 -1.652712
# 3:    a  1.7571108  1.7571108  1.757111 1.7571108  1.757111
# ------------------------------------------------------------------------------ 
#   random_df$group: python
#    char   mean_num  median_num    min_num     max_num  total_num
# 1:    b -0.7547146 -0.75471461 -0.7547146 -0.75471461 -0.7547146
# 2:    c -0.1069739 -0.08447011 -0.2965183  0.03756282 -0.4278957
# 3:    a -0.1027015 -0.28019319 -0.4581675  0.43025626 -0.3081044
# ------------------------------------------------------------------------------ 
#   random_df$group: r
#    char   mean_num median_num    min_num    max_num  total_num
# 1:    b -0.3808516 -0.3808516 -0.5868782 -0.1748250 -0.7617031
# 2:    a -1.8791749 -2.1710319 -2.9556022 -0.5108906 -5.6375247
# 3:    c -1.0714032 -1.0714032 -2.5008645  0.3580581 -2.1428064
# ------------------------------------------------------------------------------ 
#   random_df$group: sas
#    char    mean_num  median_num    min_num   max_num  total_num
# 1:    c  0.08215953 0.163064560 -1.3670420 1.3695510  0.3286381
# 2:    b -0.16423225 0.005903355 -1.3995380 0.7308023 -0.6569290
# 3:    a  0.21965022 0.540894826 -0.6086176 0.7266734  0.6589507
# ------------------------------------------------------------------------------ 
#   random_df$group: spss
#    char   mean_num  median_num    min_num    max_num total_num
# 1:    a  0.3722390  0.05407307 -0.5538001  1.9346099  1.488956
# 2:    b  0.5085635  0.73430008 -0.3003130  0.8659667  2.034254
# 3:    c -1.2909541 -1.29095413 -1.2909541 -1.2909541 -1.290954
# ------------------------------------------------------------------------------ 
#   random_df$group: stata
#    char    mean_num median_num    min_num    max_num  total_num
# 1:    a  0.01144453 -0.8381546 -1.0498222  1.9223104  0.0343336
# 2:    b -0.65927260 -0.6592726 -1.1870952 -0.1314500 -1.3185452
# 3:    c  0.94910811  0.9491081  0.9491081  0.9491081  0.9491081

推荐阅读