首页 > 解决方案 > 优化 withColumn when 子句

问题描述

我有以下代码

def GetCompletionForS4(location: String): DataFrame = {

var dfSubSystem = GetTasksFor(location, "S4").as("Tasks")
.join(GetCertsFor(location, "S4").as("Certs"),$"Tasks.SystemX" === $"Certs.SystemX" && $"Tasks.StageX" === $"Certs.StageX" , "outer")
.join(GetPTTasksFor(location, "S4").as("PT"), $"Tasks.SystemX" === $"PT.SystemX" && $"Tasks.StageX"=== $"PT.StageX", "outer")
  .withColumn("SystemizationId", coalesce(col("Tasks.SystemX"), col("Certs.SystemX"), col("PT.SystemX")))
  .withColumn("CommissioningStage", coalesce(col("Tasks.StageX"), col("Certs.StageX"), col("PT.StageX")))
  .withColumn("fPercentageClosed", when((col("PT.SystemX")).isNull,  coalesce(col("Tasks.CountX"), lit(0)).cast("double") * 0.9  + coalesce(col("Certs.CountX"), lit(0)).cast("double") * 0.1)
                                   .otherwise(coalesce(col("Tasks.CountX"), lit(0)).cast("double") * 0.6 + coalesce(col("PT.CountX"), lit(0)).cast("double") * 0.3  + coalesce(col("Certs.CountX"), lit(0)).cast("double") * 0.1)
             )
.withColumn("fActualStartDate", when(col("Tasks.ActualStartDateX").isNull,
                                     when(col("Certs.ActualStartDateX").isNull, col("PT.ActualStartDateX"))
                                     .otherwise(
                                                 when(col("PT.ActualStartDateX").isNull, col("Certs.ActualStartDateX"))
                                                 .otherwise(
                                                             when(col("Certs.ActualStartDateX")< col("PT.ActualStartDateX"), col("Certs.ActualStartDateX")).otherwise(col("PT.ActualStartDateX"))
                                                           )
                                               )
                                    )
                                    .otherwise(
                                                when(col("Certs.ActualStartDateX").isNull, 
                                                     when(col("PT.ActualStartDateX").isNull, col("Tasks.ActualStartDateX")).otherwise(
                                                                                                                                       when(col("PT.ActualStartDateX") < col("Tasks.ActualStartDateX"), col("PT.ActualStartDateX")).otherwise(col("Tasks.ActualStartDateX")) 
                                                                                                                                     ) 
                                                    )
                                                .otherwise(
                                                            when(col("PT.ActualStartDateX").isNull,
                                                                  when(col("Certs.ActualStartDateX") < col("Tasks.ActualStartDateX"), col("Certs.ActualStartDateX")).otherwise(col("Tasks.ActualStartDateX"))
                                                                )
                                                            .otherwise(
                                                                        when(col("Certs.ActualStartDateX") < col("Tasks.ActualStartDateX") , 
                                                                             when(col("Certs.ActualStartDateX") < col("PT.ActualStartDateX"), col("Certs.ActualStartDateX")).otherwise(col("PT.ActualStartDateX"))
                                                                            )
                                                                        .otherwise(
                                                                                    when(col("Tasks.ActualStartDateX") < col("PT.ActualStartDateX"), col("Tasks.ActualStartDateX")).otherwise(col("PT.ActualStartDateX"))
                                                                                  )

                                                                      )
                                                          )
                                              )

           )
.withColumn("fActualEndDate", when(col("PT.SystemX").isNull,
                                   when(col("Tasks.ActualEndDateX").isNull,null)
                                   .otherwise(
                                      when(col("Certs.ActualEndDateX").isNull, null)
                                     .otherwise(
                                       when(col("Tasks.ActualEndDateX") > col("Certs.ActualEndDateX"), col("Tasks.ActualEndDateX"))
                                       .otherwise(col("Certs.ActualEndDateX"))
                                     )
                                    )
                                  )
                              .otherwise(
                                          when(col("PT.ActualEndDateX").isNull || col("Certs.ActualEndDateX").isNull || col("Tasks.ActualEndDateX").isNull, null)
                                         .otherwise(
                                                    when(col("Tasks.ActualEndDateX") > col("Certs.ActualEndDateX"),
                                                           when(col("Tasks.ActualEndDateX") > col("PT.ActualEndDateX") , col("Tasks.ActualEndDateX")).otherwise(col("PT.ActualEndDateX"))
                                                        )
                                                    .otherwise(
                                                                 when(col("Certs.ActualEndDateX") > col("PT.ActualEndDateX") , col("Certs.ActualEndDateX")).otherwise(col("PT.ActualEndDateX"))
                                                              )
                                                   )
                                        )
           )
  .select("SystemizationId",
          "CommissioningStage",
          "fPercentageClosed",
          "fActualStartDate",
          "fActualEndDate"
         )


return dfSubSystem
}

如何优化 withColumn 内的这种疯狂的嵌套时间?它不可读或不可维护。编写此查询的更好方法是什么。有新的要求为这些嵌套的时间增加了更多的复杂性,因此不确定执行此操作的最佳方法是什么。

先感谢您

标签: scalaapache-sparkazure-databricks

解决方案


推荐阅读