首页 > 解决方案 > for循环的优化和并行化

问题描述

我的代码有问题。首先,我认为它是一个双重代码,因为一个是一组函数,用于嵌套 for 循环的主代码,就像这样:

hijos <- function(con, clasification, element){
  example <- sprintf("(select hcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2)) minus (select pcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2))", clasification, element, clasification, element)
  codhijos <- dbSendQuery(con, example)
  hijos_df <- fetch(codhijos, n=-1)
  if (length(hijos_df$HCODNIV2) == 0) {
    hijos_df <- data.frame("HCODNIV2" = element)
  }
  dbClearResult(codhijos)
  hijos_df
}

hijos_string <- function(con, clasification, element) {
  df1 <- hijos(con, clasification, element)
  df1 <- paste(df1$HCODNIV2,collapse=",")
  df1
}


resultado <- function(datos_inversion, classProduct, classBranch, year, product, branch, df1, df2) {
year_1 = year - 1
df5 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df4 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df3 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df2 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df7 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch, year, product, branch))
df6 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch,  year_1, product, branch))
colnames(df5)[1] <- 'VALOR_IR14'
colnames(df4)[1] <- 'VALOR_IN14'
colnames(df3)[1] <- 'VALOR_IN15'
colnames(df2)[1] <- 'VALOR_IR15'
colnames(df6)[1] <- 'VALOR_IN14_tot'
colnames(df7)[1] <- 'VALOR_IN15_tot'
colnames(df5)[3] <- 'CODUNIDAD_IR'
colnames(df2)[3] <- 'CODUNIDAD_IR'
colnames(df3)[3] <- 'CODUNIDAD_IN'
colnames(df4)[3] <- 'CODUNIDAD_IN'
df6 <- cbind(df6, df7[!names(df7) %in% names(df6)])
df5 <- cbind(df5, df2[!names(df2) %in% names(df5)])
df4 <- cbind(df4, df3[!names(df3) %in% names(df4)])
newdf <- cbind(df5, df4[!names(df4) %in% names(df5)])
newdf$RES1p = log(newdf$VALOR_IR14)
newdf$RES1p_15 = log(newdf$VALOR_IR15)
newdf$Res1p_tot = newdf$RES1p_15 - newdf$RES1p
newdf$Res2p = newdf$VALOR_IN14/ df6$VALOR_IN14
newdf$Res2p_15 = newdf$VALOR_IN15/ df6$VALOR_IN15
newdf$Res2p_tot = (newdf$Res2p + newdf$Res2p_15)/ 2
newdf$RESULTADO = newdf$Res1p_tot * newdf$Res2p_tot
df_resultados = sum(newdf$RESULTADO, na.rm = TRUE)
data_frame <- data.frame("ANYO" = year, "RESULTADO" = df_resultados, "PRODUCTO" = product, "RAMA" = branch, "CLASE_PR" = classProduct, "CLASE_R" = classBranch) 
data_frame
} 

这些函数只是根据 Oracle 数据框中的数据进行计算。

代码本身是这样的:

x = 1965:2015
d = 2000
g = 1000
y = c(2026,2017,2019,2023)
z = c(1034,1002,1056,1057)
start <- proc.time() # Start clock
dat <- data.frame()
for (year in x) {
  for (classProduct in d){
   for (product in y) {
      prod_string <- hijos_string(con, classProduct, product)
  product_df <- c(product, prod_string)
  product_df <- paste(product_df, collapse = ",")
  for (classBranch in g){
    for (branch in z) {
      branch_string <- hijos_string(con, classBranch, branch)
      branch_df <- c(branch, branch_string)
      branch_df <- paste(branch_df, collapse = ",")

      consulta <- sprintf("select * from stock.V_CALCULOS where  CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",classProduct, classBranch, product_df, branch_df)
      resuni <- dbSendQuery(con, consulta)
      datos_inversion <- fetch(resuni, n=-1)
      dbClearResult(resuni)

      df<-resultado(datos_inversion, classProduct, classBranch, year, product, branch, prod_string, branch_string)
      dat <- rbind(dat, df)
        }
      }
    }
  }
}  
print(dat)
time_elapsed_parallel <- proc.time() - start # End clock
time_elapsed_parallel

dbDisconnect(con) 
  1. 所以,我的问题是..我需要以某种方式并行化此代码以同时使用多个内核。同时,软件包 pbdR、multicore y 等不适用于我正在使用的 RStudio 版本(3.4.4)。
  2. 我正在考虑优化 for 循环以更有效地使用并行代码,但不知道该怎么做。

所以,最后,我的问题:任何想法如何优化 for 循环以及在如何并行化它之后,在计划中(也许)如何拆分它或如何让它同时使用 4 个内核执行并保存时间,bc 它需要永远执行,我需要尽可能少(数据框初始有超过 250 000 行)?我使用 4 核的 Windows 虚拟机

我在这里先向您的帮助表示感谢!

PS。在最终数据框中按行添加结果是精确的

标签: rdatabasefor-loopdataframeparallel-processing

解决方案


data.frame逐行迭代地构建一个可怕的规模。相反,我建议将每个迭代的输出创建到 alist中,然后在最后绑定。我没有你的数据,所以这是未经测试的。我还将假设 Matt 关于 1,632 个 SQL 查询的评论是不可避免的(因为这可能是一个很大的瓶颈)。

底线:您的问题区域可能是 (1) 数千个 SQL 查询,以及 (2) 使用rbind.

这会预先构建嵌套for循环中的所有迭代。

eg <- expand.grid(Year=x, classProduct=d, product=y, classBranch=g, branch=z)

现在让我们添加_df_string变量;我在dplyr这里使用分组等,尽管还有其他方法(base-R 和data.table):

library(dplyr)
eg <- eg %>%
  group_by(classProduct, product) %>%
  mutate(
    prod_string = hijos_string(con, classProduct[1], product[1]),
    product_df = paste(product, prod_string[1], sep=",")
  ) %>%
  group_by(classBranch, branch) %>%
  mutate(
    branch_string = hijos_string(con, classProduct[1], branch[1]),
    branch_df = paste(branch, branch_string[1], sep=",")
  ) %>%
  ungroup()

根据马特的评论,这仍然是低效的,但从迭代添加行的角度来看,效率要低得多:

lst <- lapply(seq_len(nrow(eg)), function(i) {
  consulta <- sprintf("select * from stock.V_CALCULOS where  CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",
                      eg$classProduct[i], eg$classBranch[i], eg$product_df[i], eg$branch_df[i])
  resuni <- dbSendQuery(con, consulta)
  datos_inversion <- fetch(resuni, n=-1)
  dbClearResult(resuni)
  resultado(datos_inversion, eg$classProduct[i], eg$classBranch[i], eg$year[i], eg$product[i],
            eg$branch[i], eg$prod_string[i], eg$branch_string[i])
})
dat <- do.call(rbind, lst)

同样,我不知道这是否会完美运行,但基本原理是合理的并且(我认为)相对直截了当。


推荐阅读