r - R:使循环更“高效”
问题描述
我正在使用 R。我在此处编写了以下循环(使用一些随机创建的数据),该循环遍历了一些数据操作步骤并生成了一个名为 "final_results" 的所需表:
#load library
library(dplyr)
library(data.table)
set.seed(123)
# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,100,5)
c1 = sample.int(1000, 1000, replace = TRUE)
train_data = data.frame(a1,b1,c1)
####
results_table <- data.frame()
for (i in 1:10 ) {
#generate random numbers
random_1 = runif(1, 80, 120)
random_2 = runif(1, random_1, 120)
random_3 = runif(1, 85, 120)
random_4 = runif(1, random_3, 120)
#bin data according to random criteria
train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3, "a", ifelse(a1 <= random_2 & b1 <= random_4, "b", "c")))
train_data$cat = as.factor(train_data$cat)
#new splits
a_table = train_data %>%
filter(cat == "a") %>%
select(a1, b1, c1, cat)
b_table = train_data %>%
filter(cat == "b") %>%
select(a1, b1, c1, cat)
c_table = train_data %>%
filter(cat == "c") %>%
select(a1, b1, c1, cat)
split_1 = runif(1,0, 1)
split_2 = runif(1, 0, 1)
split_3 = runif(1, 0, 1)
#calculate 60th quantile ("quant") for each bin
table_a = data.frame(a_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_1)))
table_b = data.frame(b_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_2)))
table_c = data.frame(c_table%>% group_by(cat) %>%
mutate(quant = quantile(c1, prob = split_3)))
#create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_b$diff = ifelse(table_b$quant > table_b$c1,1,0)
table_c$diff = ifelse(table_c$quant > table_c$c1,1,0)
#group all tables
final_table = rbind(table_a, table_b, table_c)
#create a table: for each bin, calculate the average of "diff"
final_table_2 = data.frame(final_table %>%
group_by(cat) %>%
summarize(
mean = mean(diff)
))
#add "total mean" to this table
final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total", mean = mean(final_table$diff)))
#format this table: add the random criteria to this table for reference
final_table_2$random_1 = random_1
final_table_2$random_2 = random_2
final_table_2$random_3 = random_3
final_table_2$random_4 = random_4
final_table_2$split_1 = split_1
final_table_2$split_2 = split_2
final_table_2$split_3 = split_3
final_table_2$iteration_number = i
results_table <- rbind(results_table, final_table_2)
final_results = dcast(setDT(results_table), iteration_number + random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat, value.var = 'mean')
}
在上面的代码中,我运行了 10 次循环。将来,我会对运行这个循环大约 1,000,000 次感兴趣。除了购买更强大的计算机之外,是否有可能重新编写这段代码以减少计算机处理的“繁重”?可以通过以不同的方式存储中间步骤来提高此代码的效率吗?有什么办法可以加快这段代码的运行时间吗?
谢谢
解决方案
好吧,您可以尝试摆脱这些 ifelse() 行。例如
#table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
table_a <- 0L
table_a[table_a$quant > table_a$c1, "diff"] <- 1L
这同样适用:
#a_table <- train_data %>%
# filter(cat == "a") %>%
# select(a1, b1, c1, cat)
a_table <- train_data[train_data$cat == "a", c("a1","b1","c1","cat")]
它使代码更加混乱,但与 dplyr 等效项相比,基本操作通常更快。
推荐阅读
- php - 在 MySQL Community Server 8.0.22 上更新 JSON 列时 JSON 文本无效
- angular-cli - 无法安装最新的 Angular 版本
- php - 以英国格式显示数据库中的时间日期的问题
- python - psycopg2:如何通过 VARIADIC ARGS?
- python - 在模板中的 href 标记内使用 django 模型的字段值
- ios - 在 iOS 反应原生项目上出现奇怪的错误
- database - 使用 Datatable 加快记录加载时间 laravel
- laravel - Laravel Cashier 通过订阅创建更多关系
- ios - SWIFT:在 TVOS 中退出收集时强制焦点引擎转到按钮
- docker - 如何构建和推送 Docker 镜像?