r - 提高 R 中不良/可能不必要的 Apply 的性能
问题描述
在此先感谢您的帮助。我不确定我是否使用apply
错误,或者只是破坏了其他减慢代码速度的规则。任何帮助表示赞赏。
概述:我有篮球数据,其中每一行都是篮球比赛中的一个时刻,包括球场上的 10 名球员、他们的球队、比赛,以及该排比赛开始的时间(1-40 分钟)。使用这些数据,我正在计算每位球员在 1 到 40 分钟的每一分钟内他们在场上的球队比赛的百分比。
例如,如果乔的球队打了 20 场比赛,如果在其中的 13 场比赛中,乔在比赛的第 5 分钟被发现在数据中,那么我们会说乔在第 5 分钟被发现在场上,他的 65%球队的比赛。我正在为每个球员、每个赛季、1-40 分钟中的每一分钟计算这个,在我不那么小的数据中,并且遇到了性能问题。这是我目前执行此操作的功能:
library(dplyr)
# Raw Data Is Play-By-Play Data - Each Row contains stats for a pl (combination of 5 basketball players)
sheets_url <- 'https://docs.google.com/spreadsheets/d/1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o/export?format=csv&id=1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o&gid=630752085'
on.ct.data <- httr::content(httr::GET(url = sheets_url))
computeOnCourtByMinutePcts <- function(on.ct.data) {
# Create Dataframe With Number Of Games Played By Team Each Season
num.home.team.games <- on.ct.data %>%
dplyr::group_by(homeTeamId, season) %>%
dplyr::summarise(count = length(unique(gameId)))
num.away.team.games <- on.ct.data %>%
dplyr::group_by(awayTeamId, season) %>%
dplyr::summarise(count = length(unique(gameId)))
num.team.games <- num.home.team.games %>%
dplyr::full_join(num.away.team.games, by = c('homeTeamId'='awayTeamId', 'season'='season')) %>%
dplyr::mutate(gamesPlayed = rowSums(cbind(count.x, count.y), na.rm = TRUE)) %>%
dplyr::rename(teamId = homeTeamId) %>%
dplyr::mutate(season = as.character(season)) %>%
dplyr::select(teamId, season, gamesPlayed)
# Create Dataframe With Players By Season - Seems kind of bulky as well
all.player.season.apperances <- rbind(
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId1, season) %>% dplyr::rename(playerId = onCtHomeId1, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId2, season) %>% dplyr::rename(playerId = onCtHomeId2, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId3, season) %>% dplyr::rename(playerId = onCtHomeId3, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId4, season) %>% dplyr::rename(playerId = onCtHomeId4, teamId = homeTeamId),
on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId5, season) %>% dplyr::rename(playerId = onCtHomeId5, teamId = homeTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId1, season) %>% dplyr::rename(playerId = onCtAwayId1, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId2, season) %>% dplyr::rename(playerId = onCtAwayId2, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId3, season) %>% dplyr::rename(playerId = onCtAwayId3, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId4, season) %>% dplyr::rename(playerId = onCtAwayId4, teamId = awayTeamId),
on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId5, season) %>% dplyr::rename(playerId = onCtAwayId5, teamId = awayTeamId)) %>%
dplyr::distinct(teamId, playerId, season) %>%
dplyr::filter(!is.na(playerId))
# For Each Player-Season, Compute Number Of Games On Court at each minute in game - this is the bad Apply
playing.time.breakdowns <- apply(X = all.player.season.apperances, MARGIN = 1, FUN = function(thisRow) {
# Set Player / Season Variables
thisPlayerId = thisRow[2]
thisSeason = thisRow[3]
# Filter for each unique minute of each game with this player on court
on.court.df = on.ct.data %>%
dplyr::filter(onCtHomeId1 == thisPlayerId | onCtHomeId2 == thisPlayerId | onCtHomeId3 == thisPlayerId | onCtHomeId4 == thisPlayerId | onCtHomeId5 == thisPlayerId |
onCtAwayId1 == thisPlayerId | onCtAwayId2 == thisPlayerId | onCtAwayId3 == thisPlayerId | onCtAwayId4 == thisPlayerId | onCtAwayId5 == thisPlayerId) %>%
dplyr::filter(season == thisSeason) %>%
dplyr::filter(!duplicated(paste0(gameId, minNumIntoGame)))
# Turn This Into a table of minutes on court by game
thisTable <- table(on.court.df$minNumIntoGame)
this.player.distrubution.df <- data.frame(
playerId = thisRow[2],
teamId = thisRow[1],
season = thisRow[3],
minNumIntoGame = as.integer(names(thisTable)),
numGamesAtMinNum = unname(thisTable) %>% as.vector(),
stringsAsFactors = FALSE
)
# 40 minutes in basketball game, so previous dataframe needs 40 rows
if(length(which(!(1:40 %in% this.player.distrubution.df$minNumIntoGame))) > 0) {
zero.mins.played.df <- data.frame(
playerId = thisRow[2],
teamId = thisRow[1],
season = thisRow[3],
minNumIntoGame = which(!(1:40 %in% this.player.distrubution.df$minNumIntoGame)),
numGamesAtMinNum = 0,
stringsAsFactors = FALSE
)
this.player.distrubution.df <- plyr::rbind.fill(this.player.distrubution.df, zero.mins.played.df) %>% dplyr::arrange(minNumIntoGame)
}
# and return
return(this.player.distrubution.df)
})
# Combine the output into one dataframe
playing.time.breakdowns <- playing.time.breakdowns %>% do.call("rbind", .)
# Join on Team-Games played
playing.time.breakdowns <- playing.time.breakdowns %>%
dplyr::left_join(num.team.games, by = c("teamId"="teamId", "season"="season")) %>%
dplyr::rename(teamGamesPlayed = gamesPlayed)
# Compute pct of games played
playing.time.breakdowns <- playing.time.breakdowns %>%
dplyr::mutate(pctMinNumPlayed = round(numGamesAtMinNum / teamGamesPlayed, 3))
# Handle OT (minNumIntoGame > 40) needs a lower gamesPlayed denominator...
# And Return
return(playing.time.breakdowns);
}
on.ct.by.min <- computeOnCourtByMinutePcts(on.ct.data)
总之,代码执行以下操作:
- 创建所有独特球员赛季和球队赛季的初始数据框。对于团队赛季,使用 pbp 数据来计算比赛次数。
- 应用 - 对于每个球员赛季:(a) 找到每场比赛每分钟球员在场上的每个实例(在 10
onCt
列之一中),(b) 将其转换为显示球员比赛次数的表格在 1-40 分钟的每一分钟都在场上。 - 擦亮并返回。将几个表连接在一起,并计算相关百分比。
apply
请注意,通过为一行手动运行该函数可能更容易遵循该函数all.player.season.appearances
。将 thisRow 设置为数据框中的任何行,并逐行运行代码以使代码更加清晰。
为了突出慢代码问题,我将大量的逐场比赛/场上数据上传到谷歌表格,将其公开,并在上面的代码中包含加载数据的链接。谷歌表格有我当前数据的约 1/2,但是我的总数据大小预计在不久的将来会增加 10 倍,并且代码目前需要约 8 分钟才能在我的计算机上运行。这是一个需要每天快速运行的脚本,我无法承受这个功能需要 80 分钟。
感觉我的apply()
调用做得不好,好像不比普通的for循环快。我不确定是否需要 apply,事实上,我认为不需要。但在过去的 24 小时里,我一直在努力思考如何改进这个功能,但没有运气。这里一定有更好的方法!
编辑:我目前正在处理的可重现示例中有一个小错误。Edit2:修复了在数据框中创建 NA 的问题num.team.games
。我刚刚运行了代码,它似乎工作正常。有大约 600 行的输出,其中 teamId 为 NA,这没什么好担心的。
Edit3:看起来应用的每次迭代需要 0.06 秒,并且数据框中有 5312 行,总计约 8 分钟的运行时间。我应该尝试将 0.06 降低到 <0.01,还是放弃整个方法?这是一个我不确定的主要问题...
解决方案
我认为这可以通过将数据转换为长格式并计算球员-分钟-球队-赛季组合来更简单地解决。(从 2008 年开始,在这台旧计算机上运行大约需要 5 秒,并且是大部分计算。)
library(tidyverse)
on.ct.data %>%
gather(spot, name, onCtHomeId1:onCtAwayId5) %>%
mutate(team = if_else(spot %>% str_detect("Away"),
awayTeamId, homeTeamId)) %>%
select(-spot) %>% # For this part, I only care about person and minute of game.
distinct() %>% # Drop dupes and instances where they were repositioned within one minute.
drop_na() %>%
select(-c(gameId:awayTeamId)) %>%
count(minNumIntoGame, name, team, season)
# A tibble: 140,581 x 5
minNumIntoGame name team season n
<dbl> <chr> <chr> <dbl> <int>
1 1 AahmaneSantos387c JAC 1819 1
2 1 AamirSimmseef9 CLEM 1819 13
3 1 AarenEdmead9cd6 NCAT 1718 1
4 1 AarenEdmead9cd6 NCAT 1819 1
5 1 AaronBrennanbee2 IUPU 1718 1
6 1 AaronCalixtea11d OKLA 1819 11
7 1 AaronCarver9cfa ODU 1819 2
8 1 AaronClarke3d67 SHU 1819 1
9 1 AaronFalzon213b NW 1718 1
10 1 AaronHolidayfce6 UCLA 1718 11
现在我们有了它,我们可以检查每个团队的游戏世界是什么样的。每个赛季每支球队在给定的一分钟内打了多少场比赛?
on.ct.data.team.minutes <- on.ct.data.minute.counts %>%
count(season, team, minNumIntoGame, gameId) %>%
count(season, team, minNumIntoGame)
ggplot(on.ct.data.team.minutes %>% slice(1:1000),
aes(minNumIntoGame, team, fill = n)) +
geom_tile() + facet_wrap(~season) +
labs(title = "# times each team played each minute (excerpt)")
...我们可以对每个球员做同样的事情,并与他们的球队进行比较,看看他们为球队效力的每一分钟所占的份额。
# How many games each season did each player play a given minute for each team?
on.ct.data.player.minutes <- on.ct.data.minute.counts %>%
count(season, team, name, minNumIntoGame) %>%
rename(player_n = n) %>%
left_join(on.ct.data.team.minutes) %>%
rename(team_n = n) %>%
mutate(player_time = player_n / team_n)
ggplot(on.ct.data.player.minutes %>% filter(name %>% str_detect("Can")),
aes(minNumIntoGame, player_time, color = name)) +
geom_line() + facet_wrap(~season) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1))
推荐阅读
- laravel - 图片来源不可读-Laravel 5.8
- web3js - 'ReferenceError: web3 未在客户端应用程序上定义
- python - 使用单个网络爬虫以预定义的格式和附件抓取多个网站?
- reactjs - 如何覆盖在 setupFiles (jest.config.js) 中定义的 jest.mock
- python-2.7 - Ruamel.yaml.jinja2: typ "jinja2" 在 CentOS 上无法识别
- c# - 运算符“==”不能应用于 C# 中“方法组”和“字符串”样式 ID 类型的操作数
- python - 为什么在 pytorch 中保存和加载模型权重后得到不同的结果?
- django - 无法在调试 = False(生产)中使用 Django 启动工作进程 Celery 任务
- python - 在 Python 中输出一个 sqlite 记录
- java - 为什么我们需要使用标记接口?