r - for循环的优化和并行化
问题描述
我的代码有问题。首先,我认为它是一个双重代码,因为一个是一组函数,用于嵌套 for 循环的主代码,就像这样:
hijos <- function(con, clasification, element){
example <- sprintf("(select hcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2)) minus (select pcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2))", clasification, element, clasification, element)
codhijos <- dbSendQuery(con, example)
hijos_df <- fetch(codhijos, n=-1)
if (length(hijos_df$HCODNIV2) == 0) {
hijos_df <- data.frame("HCODNIV2" = element)
}
dbClearResult(codhijos)
hijos_df
}
hijos_string <- function(con, clasification, element) {
df1 <- hijos(con, clasification, element)
df1 <- paste(df1$HCODNIV2,collapse=",")
df1
}
resultado <- function(datos_inversion, classProduct, classBranch, year, product, branch, df1, df2) {
year_1 = year - 1
df5 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df4 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df3 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df2 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df7 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch, year, product, branch))
df6 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch, year_1, product, branch))
colnames(df5)[1] <- 'VALOR_IR14'
colnames(df4)[1] <- 'VALOR_IN14'
colnames(df3)[1] <- 'VALOR_IN15'
colnames(df2)[1] <- 'VALOR_IR15'
colnames(df6)[1] <- 'VALOR_IN14_tot'
colnames(df7)[1] <- 'VALOR_IN15_tot'
colnames(df5)[3] <- 'CODUNIDAD_IR'
colnames(df2)[3] <- 'CODUNIDAD_IR'
colnames(df3)[3] <- 'CODUNIDAD_IN'
colnames(df4)[3] <- 'CODUNIDAD_IN'
df6 <- cbind(df6, df7[!names(df7) %in% names(df6)])
df5 <- cbind(df5, df2[!names(df2) %in% names(df5)])
df4 <- cbind(df4, df3[!names(df3) %in% names(df4)])
newdf <- cbind(df5, df4[!names(df4) %in% names(df5)])
newdf$RES1p = log(newdf$VALOR_IR14)
newdf$RES1p_15 = log(newdf$VALOR_IR15)
newdf$Res1p_tot = newdf$RES1p_15 - newdf$RES1p
newdf$Res2p = newdf$VALOR_IN14/ df6$VALOR_IN14
newdf$Res2p_15 = newdf$VALOR_IN15/ df6$VALOR_IN15
newdf$Res2p_tot = (newdf$Res2p + newdf$Res2p_15)/ 2
newdf$RESULTADO = newdf$Res1p_tot * newdf$Res2p_tot
df_resultados = sum(newdf$RESULTADO, na.rm = TRUE)
data_frame <- data.frame("ANYO" = year, "RESULTADO" = df_resultados, "PRODUCTO" = product, "RAMA" = branch, "CLASE_PR" = classProduct, "CLASE_R" = classBranch)
data_frame
}
这些函数只是根据 Oracle 数据框中的数据进行计算。
代码本身是这样的:
x = 1965:2015
d = 2000
g = 1000
y = c(2026,2017,2019,2023)
z = c(1034,1002,1056,1057)
start <- proc.time() # Start clock
dat <- data.frame()
for (year in x) {
for (classProduct in d){
for (product in y) {
prod_string <- hijos_string(con, classProduct, product)
product_df <- c(product, prod_string)
product_df <- paste(product_df, collapse = ",")
for (classBranch in g){
for (branch in z) {
branch_string <- hijos_string(con, classBranch, branch)
branch_df <- c(branch, branch_string)
branch_df <- paste(branch_df, collapse = ",")
consulta <- sprintf("select * from stock.V_CALCULOS where CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",classProduct, classBranch, product_df, branch_df)
resuni <- dbSendQuery(con, consulta)
datos_inversion <- fetch(resuni, n=-1)
dbClearResult(resuni)
df<-resultado(datos_inversion, classProduct, classBranch, year, product, branch, prod_string, branch_string)
dat <- rbind(dat, df)
}
}
}
}
}
print(dat)
time_elapsed_parallel <- proc.time() - start # End clock
time_elapsed_parallel
dbDisconnect(con)
- 所以,我的问题是..我需要以某种方式并行化此代码以同时使用多个内核。同时,软件包 pbdR、multicore y 等不适用于我正在使用的 RStudio 版本(3.4.4)。
- 我正在考虑优化 for 循环以更有效地使用并行代码,但不知道该怎么做。
所以,最后,我的问题:任何想法如何优化 for 循环以及在如何并行化它之后,在计划中(也许)如何拆分它或如何让它同时使用 4 个内核执行并保存时间,bc 它需要永远执行,我需要尽可能少(数据框初始有超过 250 000 行)?我使用 4 核的 Windows 虚拟机
我在这里先向您的帮助表示感谢!
PS。在最终数据框中按行添加结果是精确的
解决方案
data.frame
逐行迭代地构建一个可怕的规模。相反,我建议将每个迭代的输出创建到 alist
中,然后在最后绑定。我没有你的数据,所以这是未经测试的。我还将假设 Matt 关于 1,632 个 SQL 查询的评论是不可避免的(因为这可能是一个很大的瓶颈)。
底线:您的问题区域可能是 (1) 数千个 SQL 查询,以及 (2) 使用rbind
.
这会预先构建嵌套for
循环中的所有迭代。
eg <- expand.grid(Year=x, classProduct=d, product=y, classBranch=g, branch=z)
现在让我们添加_df
和_string
变量;我在dplyr
这里使用分组等,尽管还有其他方法(base-R 和data.table
):
library(dplyr)
eg <- eg %>%
group_by(classProduct, product) %>%
mutate(
prod_string = hijos_string(con, classProduct[1], product[1]),
product_df = paste(product, prod_string[1], sep=",")
) %>%
group_by(classBranch, branch) %>%
mutate(
branch_string = hijos_string(con, classProduct[1], branch[1]),
branch_df = paste(branch, branch_string[1], sep=",")
) %>%
ungroup()
根据马特的评论,这仍然是低效的,但从迭代添加行的角度来看,效率要低得多:
lst <- lapply(seq_len(nrow(eg)), function(i) {
consulta <- sprintf("select * from stock.V_CALCULOS where CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",
eg$classProduct[i], eg$classBranch[i], eg$product_df[i], eg$branch_df[i])
resuni <- dbSendQuery(con, consulta)
datos_inversion <- fetch(resuni, n=-1)
dbClearResult(resuni)
resultado(datos_inversion, eg$classProduct[i], eg$classBranch[i], eg$year[i], eg$product[i],
eg$branch[i], eg$prod_string[i], eg$branch_string[i])
})
dat <- do.call(rbind, lst)
同样,我不知道这是否会完美运行,但基本原理是合理的并且(我认为)相对直截了当。
推荐阅读
- python - 创建一个计算单词和字符的函数(包括标点符号,但不包括空格)
- json - 规范化 REST API 的 redux 状态
- php - BAD / FAKE页面重定向到404页面不起作用
- java - 将多个表(相同字段)映射到单个实体 - Hibernate
- r - R Markdown参数和变量之间的差异
- javascript - 如何获取由唯一 ID 号标识的文档,但 firebase auth 仅在登录时提供电子邮件?
- python - lxml.html.fromstring 返回
- python - 获取 AWS EC2 特定标签/值组合 + 实例 ID
- python - 如何修复 TypeError: *: 'method' 和 'float' 不支持的操作数类型
- python - 缓慢的“统计”功能