首页 > 解决方案 > 将一个组与 R 中的其他组作为一个整体进行比较

问题描述

以下是一些示例数据:

# Sample data: ten rows with a numeric ID, three categorical columns
# (movie_type, snack_type, event_type) and a numeric total_cost.
movie_df <- data.frame(
  ID = as.numeric(1:10),
  movie_type = c(
    "Action", "Horror", "Comedy", "Thriller", "Comedy",
    "Action", "Thriller", "Horror", "Action", "Comedy"
  ),
  snack_type = c(
    "Chocolate", "Popcorn", "Candy", "Popcorn", "Popcorn",
    "Candy", "Chocolate", "Candy", "Popcorn", "Chocolate"
  ),
  event_type = c(
    "Solo", "Family", "Date", "Friends", "Solo",
    "Family", "Date", "Date", "Friends", "Friends"
  ),
  total_cost = c(
    50, 35, 20, 50, 30,
    60, 25, 35, 20, 50
  )
)

我想要做的是遍历每一列,并就 total_cost 将每个组与其余所有数据进行比较。例如,我想看看 movie_type == 'Action' 的 total_cost 与 movie_type != 'Action' 的相比如何。我想对 movie_type 中的每种类型执行此操作,然后对 snack_type 和 event_type 中的每种类型也执行此操作。

我最终想要得到的是下面这样的结果,其中 sd 表示标准差(Standard Deviation)。理想情况下,希望通过 R 中的 tidyverse 方法(例如 dplyr 或 tidyr)来完成:

> results_df
# A tibble: 11 x 11
   Group      Grp_1     Grp_2         Grp_1_mean Grp_2_mean Grp_1_sd Grp_2_sd Grp_1_n Grp_2_n Mean_Diff `t-test`
   <chr>      <chr>     <chr>              <dbl>      <dbl>    <dbl>    <dbl>   <dbl>   <dbl>     <dbl>    <dbl>
 1 movie_type Action    Rest of group       43.3       35      20.8      11.5       3       7      8.33    2.84 
 2 movie_type Horror    Rest of group       35         38.1     0        16.0       2       8     -3.12   -2.21 
 3 movie_type Thriller  Rest of group       37.5       37.5    17.7      14.6       2       8      0       0    
 4 movie_type Comedy    Rest of group       33.3       39.3    15.3      14.6       3       7     -5.95   -2.22 
 5 snack_type Chocolate Rest of group       41.7       35.7    14.4      14.8       3       7      5.95    2.26 
 6 snack_type Candy     Rest of group       38.3       37.1    20.2      12.9       3       7      1.19    0.407
 7 snack_type Popcorn   Rest of group       33.8       40      12.5      15.8       4       6     -6.25   -2.60 
 8 event_type Date      Rest of group       26.7       42.1     7.64     14.1       3       7    -15.5    -7.25 
 9 event_type Family    Rest of group       47.5       35      17.7      13.4       2       8     12.5     3.86 
10 event_type Friends   Rest of group       40         36.4    17.3      14.1       3       7      3.57    1.28 
11 event_type Solo      Rest of group       40         36.9    14.1      15.1       2       8      3.12    1.04 

标签: r, dplyr, tidyverse, tidyr

解决方案


这与 Daniel 使用 purrr::map 和 purrr::map2 的逻辑相同。

library(dplyr)
library(tibble)
library(purrr)
library(stringr)

# Names of the categorical columns of movie_df to iterate over.
needed_cols <- c("movie_type", "snack_type", "event_type")

# Column names for one wide summary row: the four summary fields with
# suffix _1, followed by the same four fields with suffix _2.
# outer() builds a 4 x 2 matrix of pasted names; as.vector() flattens it
# column by column, so all *_1 names come before all *_2 names.
new_names <- as.vector(
  outer(c("group", "mean", "sd", "n"), 1:2, paste, sep = "_")
)

# Build one wide summary row per (column, level) pair: the mean, standard
# deviation and row count of total_cost for that level, next to the same
# statistics for all remaining rows ("rest_of_group"). The rows are then
# stacked and the difference between the two means is added as mean_diff.
# Reads movie_df, needed_cols and new_names from the enclosing environment.
my_data <- needed_cols %>%
  map(function(df_c) {
    map(unique(movie_df[[df_c]]), function(v) {
      # Row order wanted in the two-row summary: focal level, then the rest.
      row_order <- c(v, "rest_of_group")

      stats <- movie_df %>%
        # .data[[df_c]] looks the column up in the data frame only; get()
        # could silently pick up an unrelated object of the same name.
        mutate(group = ifelse(.data[[df_c]] == v, v, "rest_of_group")) %>%
        group_by(group) %>%
        summarize(mean = mean(total_cost), sd = sd(total_cost), n = n()) %>%
        # match(wanted, actual) yields the row index of each wanted label.
        .[match(row_order, .$group), ]

      # Apply the final suffixed names to each row *before* binding, so
      # bind_cols() never sees duplicate column names and emits no
      # name-repair message.
      k <- ncol(stats)
      bind_cols(
        setNames(stats[1, ], new_names[seq_len(k)]),
        setNames(stats[2, ], new_names[k + seq_len(k)])
      )
    })
  }) %>%
  # Stack the per-level rows of each column and tag them with the name of
  # the column they were computed from.
  map2(needed_cols, function(rows, col_name) {
    bind_rows(rows) %>% mutate(group = col_name)
  }) %>%
  bind_rows() %>%
  # Prefix-anchored helpers: a bare "n" pattern would also match "mean_*".
  select(
    group, group_1, group_2,
    starts_with("mean_"),
    starts_with("sd_"),
    starts_with("n_")
  ) %>%
  mutate(mean_diff = mean_1 - mean_2)

推荐阅读