首页 > 解决方案 > mcmapply 在多核上的性能

问题描述

我有一个要在大约 300 万个数据点上运行的函数。我正在尝试使用 mcmapply 在一台具有 8 个内核的 Ubuntu 机器上并行化该函数。该函数接受一个长度为 300 万的 list、另外 3 个长度为 300 万的向量,以及 1 个常量值 cutoffyearmon。

该代码在单核上处理 100000 行数据只需 2 分钟,运行良好,并且没有抛出错误。但是,当我尝试使用 mcmapply 在我的机器的 6 个内核上并行运行代码时,它会持续运行超过 5 个小时。

更新:这是我的函数调用的简化版本。在完整版本中,我还为 1 个月、2 个月和 3 个月的持续时间另外创建了 9 个变量;这里我只保留了 6 个月和 1 年的时间变量。

我正在使用以下函数调用:

# Summarise every record's "abc" history string into ten numeric features,
# processing the records in parallel with mcmapply().
#
# Inputs (one element per record unless noted):
#   lst           - history strings made of consecutive 3-character numeric
#                   codes (e.g. "000050000032" -> 0, 50, 0, 32).
#   sd            - date whose year-month is compared with `cutoffyearmon`.
#   naflag        - TRUE when the record must be treated as missing.
#   empflag       - TRUE when the record must be treated as empty.
#   daysdiff      - number compared against the 365 / 180 day horizons.
#   cutoffyearmon - single year-month constant shared by all records; it is
#                   handed over through `MoreArgs` so it is passed once instead
#                   of being recycled alongside the per-record vectors.
#
# Each call returns a named list of ten doubles:
#   * all NA_real_ when `daysdiff` is NA or when the record is flagged as
#     missing/empty (an NA flag is treated as "flagged" rather than aborting);
#   * otherwise the overall maximum / count of positive codes, plus, for the
#     365-day and 180-day horizons, the maximum and the counts of codes > 0,
#     >= 30 and >= 60 among the leading `window` codes, where
#     window = (12 or 6) - months between `cutoffyearmon` and `sd`.
#     A horizon contributes zeros when `daysdiff` exceeds it or when its
#     window is not positive.
abc_xx_last_xxx_days <- mcmapply(
  function(abcstrnew, sd, naflag, empflag, daysdiff, cutoffyearmon) {
    na_result <- list(
      worst_abc_ever = NA_real_,
      times_abc = NA_real_,
      times_abc_last_180_days = NA_real_,
      times_abc_last_365_days = NA_real_,
      times_abc30_last_365_days = NA_real_,
      times_abc30_last_180_days = NA_real_,
      times_abc60_last_365_days = NA_real_,
      times_abc60_last_180_days = NA_real_,
      abc_last_180_days = NA_real_,
      abc_last_365_days = NA_real_
    )

    # Bail out before any parsing: nothing below is needed for these records.
    # isTRUE() turns an NA flag into "not usable" instead of an `if` error.
    has_history <- isTRUE((!naflag) && (!empflag))
    if (is.na(daysdiff) || !has_history) {
      return(na_result)
    }

    # Split the string into consecutive 3-character codes and make them
    # numeric. NOTE(review): seq() fails for strings shorter than 3
    # characters; presumably such records are flagged via `empflag` - confirm.
    starts <- seq(from = 1, to = nchar(abcstrnew) - 2, by = 3)
    values <- as.numeric(
      substring(text = abcstrnew, first = starts, last = starts + 2)
    )
    n_values <- length(values)

    zero_stats <- c(worst = 0, count_any = 0, count_30 = 0, count_60 = 0)

    # Statistics over the leading `window_months` codes (capped at the number
    # of codes available). A non-positive window covers no codes, so it yields
    # zeros; `1:window` would wrongly pick element 1 for a window of 0.
    window_stats <- function(window_months) {
      if (window_months <= 0) {
        return(zero_stats)
      }
      recent <- values[seq_len(min(n_values, window_months))]
      c(
        worst = max(recent),
        count_any = sum(recent > 0, na.rm = TRUE),
        count_30 = sum(recent >= 30, na.rm = TRUE),
        count_60 = sum(recent >= 60, na.rm = TRUE)
      )
    }

    stats_365 <- zero_stats
    stats_180 <- zero_stats
    if (daysdiff <= 365) {
      # Whole months between the record's year-month and the cutoff,
      # approximated as days / 30. Computed once and shared by both horizons.
      gap_days <- as.numeric(
        difftime(
          time1 = cutoffyearmon, time2 = as.yearmon(sd), units = "days"
        )
      )
      months_gap <- round(round(gap_days) / 30)

      stats_365 <- window_stats(12 - months_gap)
      if (daysdiff <= 180) {
        stats_180 <- window_stats(6 - months_gap)
      }
    }

    list(
      worst_abc_ever = max(values),
      times_abc = as.numeric(sum(values > 0, na.rm = TRUE)),
      times_abc_last_180_days = stats_180[["count_any"]],
      times_abc_last_365_days = stats_365[["count_any"]],
      times_abc30_last_365_days = stats_365[["count_30"]],
      times_abc30_last_180_days = stats_180[["count_30"]],
      times_abc60_last_365_days = stats_365[["count_60"]],
      times_abc60_last_180_days = stats_180[["count_60"]],
      abc_last_180_days = stats_180[["worst"]],
      abc_last_365_days = stats_365[["worst"]]
    )
  },
  lst, sd, naflag, empflag, daysdiff,
  MoreArgs = list(cutoffyearmon = cutoffyearmon),
  mc.cores = 6, mc.preschedule = TRUE, mc.cleanup = TRUE
)

您可以使用以下一组输入来运行该函数并检查其输出。

# Two sample records for exercising the mcmapply() call.
# History strings, one per record.
lst <- as.list(c("000050000032", "000000340000000000000"))
# Per-record dates; the generic as.Date() dispatches to the character method.
sd <- as.Date(c("2017-05-22", "2017-04-23"))
# Neither record is flagged as empty or as missing.
empflag <- rep(FALSE, 2)
naflag <- rep(FALSE, 2)
# Time elapsed between each record's date and 2017-06-30.
daysdiff <- difftime(time1 = as.Date("2017-06-30"), time2 = sd)
# Year-month constant shared by both records.
cutoffyearmon <- as.yearmon("2017-06-30")

我原以为设置 mc.preschedule=TRUE 之后,代码的运行速度会得到提升,但是我看不到处理速度有任何显著的改善。在机器的 6 个内核上运行时,我预计处理将在大约 1.5 小时内完成。

如果我遗漏了什么,请给出建议。

当使用 pbmcmapply 并设置 mc.cores=6 时,我得到的 ETA 为 06:01:32:57。

标签: r, parallel-processing, mapply, mclapply

解决方案


推荐阅读