首页 > 解决方案 > 如何使用 dplyr 对另一个表中的数据进行计算?

问题描述

我正在做一些研究,让我在几个单独的df. 分离的结果df将用于一项主要计算df。大多数统计类别用于几个df. 我希望使用单独的结果df


# Table used for league average calculations below
teams <- tibble::tribble(
  ~Team,  ~PA,  ~AB,   ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Athletics", 6255, 5579, 1407,   76, 550, 227, 813,  778,  35, 0.252, 0.325,
  "Red Sox", 6302, 5623, 1509,   55, 569, 208, 876,  829, 125, 0.268, 0.339,
  "Yankees", 6271, 5515, 1374,   62, 625, 267, 851,  821,  63, 0.249, 0.329,
  "Indians", 6300, 5595, 1447,   80, 554, 216, 818,  786, 135, 0.259, 0.332,
  "Astros", 6146, 5453, 1390,   61, 565, 205, 797,  763,  71, 0.255, 0.329
)


# Table used for player calculations (main table)
players <- tibble::tribble(
  ~Name,     ~Team,  ~G, ~PA, ~AB,  ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Mookie Betts", "Red Sox", 136, 614, 520, 180,    8,  81,  32, 129,   80,  30, 0.346, 0.438,
  "Mike Trout",  "Angels", 140, 608, 471, 147,   10, 122,  39, 101,   79,  24, 0.312,  0.46,
  "J.D. Martinez", "Red Sox", 150, 649, 569, 188,    4,  69,  43, 111,  130,   6,  0.33, 0.402,
  "Alex Bregman",  "Astros", 157, 705, 594, 170,   12,  96,  31, 105,  103,  10, 0.286, 0.394,
  "Jose Ramirez", "Indians", 157, 698, 578, 156,    8, 106,  39, 110,  105,  34,  0.27, 0.387
)


# Denominators needed for calculations
calc_tbl <- tibble::tribble(
  ~data_col, ~calc_denom,
  "HR",        14.3,
  "R",        19.6,
  "RBI",        17.5,
  "SB",        26.2,
  "AVG",      0.0045,
  "OBP",      0.0031
) %>% 
  spread(key = data_col, value = "calc_denom")


# Get league average of teams
lg_avg <- teams %>% 
# Divide counting stats by 10 to get the averages for 10 batters
  mutate_at(vars(PA:SB), funs(./10)) %>% 
  summarize_if(is.numeric, mean, na.rm=TRUE)

lg_avg
#> # A tibble: 1 x 11
#>      PA    AB     H   HBP    BB    HR     R   RBI    SB   AVG   OBP
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  625.  555.  143.  6.68  57.3  22.5  83.1  79.5  8.58 0.257 0.331

# Calculate Values
value_tbl <- players %>% 
  mutate(calc_R = R / calc_tbl$R,
         calc_HR = HR / calc_tbl$HR,
         calc_RBI = RBI / calc_tbl$RBI,
         calc_SB = SB / calc_tbl$SB,
         calc_BA = (((lg_avg$H * 13 ) + H)/(AB + (lg_avg$AB * 13)) - lg_avg$AVG) / calc_tbl$AVG,
         calc_Total = (calc_R + calc_HR + calc_RBI + calc_SB + calc_BA))

我真的有两个问题,都关注效率,以及是否有更好的方法来做我正在做的事情。我df是否正确调用其他列计算的结果?
有没有更直接、更有效的方法来编写变异的最后一段代码?

标签: rdplyrtidyverse

解决方案


看看这是否适合你(使用data.table):

# load packages
library(data.table)

# Table used for league average calculations below ----
teams <- tibble::tribble(
  ~Team,  ~PA,  ~AB,   ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Athletics", 6255, 5579, 1407,   76, 550, 227, 813,  778,  35, 0.252, 0.325,
  "Red Sox", 6302, 5623, 1509,   55, 569, 208, 876,  829, 125, 0.268, 0.339,
  "Yankees", 6271, 5515, 1374,   62, 625, 267, 851,  821,  63, 0.249, 0.329,
  "Indians", 6300, 5595, 1447,   80, 554, 216, 818,  786, 135, 0.259, 0.332,
  "Astros", 6146, 5453, 1390,   61, 565, 205, 797,  763,  71, 0.255, 0.329
)
setDT(teams) # set df as data.table

# Table used for player calculations (main table) -----
players <- tibble::tribble(
  ~Name,     ~Team,  ~G, ~PA, ~AB,  ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Mookie Betts", "Red Sox", 136, 614, 520, 180,    8,  81,  32, 129,   80,  30, 0.346, 0.438,
  "Mike Trout",  "Angels", 140, 608, 471, 147,   10, 122,  39, 101,   79,  24, 0.312,  0.46,
  "J.D. Martinez", "Red Sox", 150, 649, 569, 188,    4,  69,  43, 111,  130,   6,  0.33, 0.402,
  "Alex Bregman",  "Astros", 157, 705, 594, 170,   12,  96,  31, 105,  103,  10, 0.286, 0.394,
  "Jose Ramirez", "Indians", 157, 698, 578, 156,    8, 106,  39, 110,  105,  34,  0.27, 0.387
)
setDT(players) # set df as data.table

# Denominators needed for calculations----
calc_tbl <- tibble::tribble(
  ~data_col, ~calc_denom,
  "HR",        14.3,
  "R",        19.6,
  "RBI",        17.5,
  "SB",        26.2,
  "AVG",      0.0045,
  "OBP",      0.0031
)
setDT(calc_tbl) # set df as data.table

# Get league average of teams ----
lg_avg <- teams[, lapply(.SD, mean, na.rm = T), .SDcols = which(sapply(teams, is.numeric))] # summarize by columns that are numeric

# Calculate Values
cols <- names(players)[-c(1:2)] # assign an object with the column names to be calculated

res <- copy(players) # optional: I am making a copy of "players" because the `:=` operator changes the data by reference. If don't need to preserve the players table, then you don't need to make a copy, replace res in the loop and in the data.table::melt(...) expression by "players".

    for(i in cols){
  if(i == "OBP"){
    res[, (i) := (lg_avg$OBP * lg_avg$PA) + H + BB]
    next
  }
  res[, (i) := lapply(.SD, function(x) {
    if(is.null(lg_avg[[i]])) return(NA)
    return(x/lg_avg[[i]])
    }), .SDcols= i]
}

res <- data.table::melt(res, id.vars = c(1:2), variable.name = "stat_value", value.name = "calc_column")

部分结果:

> head(res)
            Name    Team stat_value calc_column
1:  Mookie Betts Red Sox          G          NA
2:    Mike Trout  Angels          G          NA
3: J.D. Martinez Red Sox          G          NA
4:  Alex Bregman  Astros          G          NA
5:  Jose Ramirez Indians          G          NA
6:  Mookie Betts Red Sox         PA  0.09816461

推荐阅读