How to create a Spark dataframe with more than 22 columns using the Seq(..).toDF(..) syntax

Question

I am trying to create a dataframe like this:

Seq
  .empty[(String, String, String, String, String, String, String, String, String, String, String, String, String, String,
          String, String, String, String, String, String, String, String, String
  )]
  .toDF(Seq(EmployeeJobFieldsName.EMP_ID, EmployeeJobFieldsName.FIRST_NAME, EmployeeJobFieldsName.LAST_NAME, EmployeeJobFieldsName.FULL_NAME,
    EmployeeJobFieldsName.DOB, EmployeeJobFieldsName.GOVT_ID_TYPE, EmployeeJobFieldsName.GOVT_ID_TYPE_COUNTRY, EmployeeJobFieldsName.GOVT_ID_VALUE,
    EmployeeJobFieldsName.WORK_EMAIL, EmployeeJobFieldsName.PHONE, EmployeeJobFieldsName.PERSONAL_EMAIL, EmployeeJobFieldsName.HOME_ADDRESS_LINE_2,
    EmployeeJobFieldsName.HOME_ADDRESS_LINE_2, EmployeeJobFieldsName.HOME_CITY, EmployeeJobFieldsName.HOME_STATE, EmployeeJobFieldsName.HOME_COUNTRY,
    EmployeeJobFieldsName.HOME_POSTAL, EmployeeJobFieldsName.MAIL_ADDRESS_LINE_2, EmployeeJobFieldsName.MAIL_ADDRESS_LINE_2, EmployeeJobFieldsName.MAIL_CITY,
    EmployeeJobFieldsName.MAIL_STATE, EmployeeJobFieldsName.MAIL_COUNTRY, EmployeeJobFieldsName.MAIL_POSTAL, EmployeeJobFieldsName.EFF_END_DATE,
    EmployeeJobFieldsName.EFF_START_DATE, EmployeeJobFieldsName.EMP_STATUS, EmployeeJobFieldsName.STD_HOURS, EmployeeJobFieldsName.VENDOR_PAY_GROUP,
    EmployeeJobFieldsName.AMAZON_PAY_GROUP, EmployeeJobFieldsName.PAY_FREQUENCY, EmployeeJobFieldsName.EMP_TYPE, EmployeeJobFieldsName.COMP_FREQUENCY,
    EmployeeJobFieldsName.WORK_LOCATION, EmployeeJobFieldsName.WORK_LOCATION_TIME_ZONE, EmployeeJobFieldsName.WORK_STATE, EmployeeJobFieldsName.LAST_HIRE_DATE,
    EmployeeJobFieldsName.TERM_DATE, EmployeeJobFieldsName.COMPANY, EmployeeJobFieldsName.COMPANY_COUNTRY_ISO3, EmployeeJobFieldsName.COMPANY_COUNTRY_ISO2,
    EmployeeJobFieldsName.LOGIN_ID, EmployeeJobFieldsName.JOB_LEVEL, EmployeeJobFieldsName.DEPT_ID, EmployeeJobFieldsName.REG_OR_TEMP,
    EmployeeJobFieldsName.FULL_OR_PART_TIME, EmployeeJobFieldsName.EMP_CLASS, EmployeeJobFieldsName.JOB_CODE, EmployeeJobFieldsName.PAYROLL_ID,
    EmployeeJobFieldsName.SCHEDULE, EmployeeJobFieldsName.CURRENCY_CODE, EmployeeJobFieldsName.ANNUAL_RATE, EmployeeJobFieldsName.COMP_RATE,
    EmployeeJobFieldsName.HOURLY_RATE, EmployeeJobFieldsName.SHIFT_DIFFERENTIAL, EmployeeJobFieldsName.COMPANY_NAME, EmployeeJobFieldsName.COMPANY_DESC,
    EmployeeJobFieldsName.COMPANY_ADDRESS_LINE_2, EmployeeJobFieldsName.COMPANY_ADDRESS_LINE_2, EmployeeJobFieldsName.COMPANY_CITY,
    EmployeeJobFieldsName.COMPANY_STATE, EmployeeJobFieldsName.COMPANY_POSTAL): _*)

If I go beyond 22 columns, I get an error. Any help on how to get around this?

Tags: dataframe, apache-spark-sql

Solution


Since Scala only defines tuples up to Tuple22 & Product22, you can create a dataframe of at most a Tuple22, as below -

 // requires `import spark.implicits._` from an active SparkSession (assumed here to be named `spark`)
 val df1 = Seq.empty[(
      String, String, String, String, String, String, String, String, String, String,
      String, String, String, String, String, String, String, String, String, String,
      String, String)].toDF(Range(1, 23).map(s => s"col$s") : _*)
    df1.show(false)
    /**
      * +----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
      * |col1|col2|col3|col4|col5|col6|col7|col8|col9|col10|col11|col12|col13|col14|col15|col16|col17|col18|col19|col20|col21|col22|
      * +----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
      * +----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
      */

Since Tuple23 & Product23 are not defined, you cannot use the above syntax to create a dataframe with more than 22 columns.
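As a side note (not part of the original answer), a commonly used alternative that avoids defining new tuple types altogether is to build the empty dataframe from an explicit schema. A minimal sketch, assuming an active SparkSession named `spark` and all-string columns:

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    // build a 23-column (or wider) schema explicitly instead of relying on TupleN
    val schema = StructType((1 to 23).map(i => StructField(s"col$i", StringType, nullable = true)))

    // empty dataframe with that schema; no Tuple23/Product23 needed
    val emptyDf = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
    emptyDf.printSchema()

The rest of this answer shows how to make the original Seq(..).toDF(..) syntax itself work for 23 columns.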

Update-1: creating a dataframe with more than 22 columns, say 23

1. Define Product23 and Tuple23

Product23.scala


object Product23 {
  def unapply[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23]
  (x: Product23[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23])
  : Option[Product23[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23]] =
    Some(x)
}

/** Product23 is a cartesian product of 23 components.
  *  @since 2.3
  */
trait Product23[+T1, +T2, +T3, +T4, +T5, +T6, +T7, +T8, +T9, +T10, +T11, +T12, +T13, +T14, +T15, +T16, +T17, +T18, +T19, +T20, +T21, +T22, +T23] extends Any with Product {
  /** The arity of this product.
    *  @return 23
    */
  override def productArity = 23


  /** Returns the n-th projection of this product if 0 <= n < productArity,
    *  otherwise throws an `IndexOutOfBoundsException`.
    *
    *  @param n number of the projection to be returned
    *  @return  same as `._(n+1)`, for example `productElement(0)` is the same as `._1`.
    *  @throws  IndexOutOfBoundsException
    */

  @throws(classOf[IndexOutOfBoundsException])
  override def productElement(n: Int) = n match {
    case 0 => _1
    case 1 => _2
    case 2 => _3
    case 3 => _4
    case 4 => _5
    case 5 => _6
    case 6 => _7
    case 7 => _8
    case 8 => _9
    case 9 => _10
    case 10 => _11
    case 11 => _12
    case 12 => _13
    case 13 => _14
    case 14 => _15
    case 15 => _16
    case 16 => _17
    case 17 => _18
    case 18 => _19
    case 19 => _20
    case 20 => _21
    case 21 => _22
    case 22 => _23
    case _ => throw new IndexOutOfBoundsException(n.toString())
  }

  /** A projection of element 1 of this Product.
    *  @return   A projection of element 1.
    */
  def _1: T1
  /** A projection of element 2 of this Product.
    *  @return   A projection of element 2.
    */
  def _2: T2
  /** A projection of element 3 of this Product.
    *  @return   A projection of element 3.
    */
  def _3: T3
  /** A projection of element 4 of this Product.
    *  @return   A projection of element 4.
    */
  def _4: T4
  /** A projection of element 5 of this Product.
    *  @return   A projection of element 5.
    */
  def _5: T5
  /** A projection of element 6 of this Product.
    *  @return   A projection of element 6.
    */
  def _6: T6
  /** A projection of element 7 of this Product.
    *  @return   A projection of element 7.
    */
  def _7: T7
  /** A projection of element 8 of this Product.
    *  @return   A projection of element 8.
    */
  def _8: T8
  /** A projection of element 9 of this Product.
    *  @return   A projection of element 9.
    */
  def _9: T9
  /** A projection of element 10 of this Product.
    *  @return   A projection of element 10.
    */
  def _10: T10
  /** A projection of element 11 of this Product.
    *  @return   A projection of element 11.
    */
  def _11: T11
  /** A projection of element 12 of this Product.
    *  @return   A projection of element 12.
    */
  def _12: T12
  /** A projection of element 13 of this Product.
    *  @return   A projection of element 13.
    */
  def _13: T13
  /** A projection of element 14 of this Product.
    *  @return   A projection of element 14.
    */
  def _14: T14
  /** A projection of element 15 of this Product.
    *  @return   A projection of element 15.
    */
  def _15: T15
  /** A projection of element 16 of this Product.
    *  @return   A projection of element 16.
    */
  def _16: T16
  /** A projection of element 17 of this Product.
    *  @return   A projection of element 17.
    */
  def _17: T17
  /** A projection of element 18 of this Product.
    *  @return   A projection of element 18.
    */
  def _18: T18
  /** A projection of element 19 of this Product.
    *  @return   A projection of element 19.
    */
  def _19: T19
  /** A projection of element 20 of this Product.
    *  @return   A projection of element 20.
    */
  def _20: T20
  /** A projection of element 21 of this Product.
    *  @return   A projection of element 21.
    */
  def _21: T21
  /** A projection of element 22 of this Product.
    *  @return   A projection of element 22.
    */
  def _22: T22
  /** A projection of element 23 of this Product.
    *  @return   A projection of element 23.
    */
  def _23: T23


}

Tuple23.scala


case class Tuple23[+T1, +T2, +T3, +T4, +T5, +T6, +T7, +T8, +T9, +T10, +T11, +T12, +T13, +T14, +T15, +T16, +T17, +T18, +T19, +T20, +T21, +T22, +T23]
(_1: T1, _2: T2, _3: T3, _4: T4, _5: T5, _6: T6, _7: T7, _8: T8, _9: T9, _10: T10, _11: T11, _12: T12, _13: T13, _14: T14, _15: T15, _16: T16, _17: T17, _18: T18, _19: T19, _20: T20, _21: T21, _22: T22, _23: T23)
  extends Product23[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23]
{
  override def toString() = "(" + _1 + "," + _2 + "," + _3 + "," + _4 + "," + _5 + "," + _6 + "," + _7 + "," + _8 + "," + _9 + "," + _10 + "," + _11 +
    "," + _12 + "," + _13 + "," + _14 + "," + _15 + "," + _16 + "," + _17 + "," + _18 + "," + _19 + "," + _20 + "," + _21 + "," + _22 + "," + _23 + ")"

}
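A quick sanity check (a minimal sketch, assuming the two definitions above are compiled together) that the new Tuple23 behaves like the built-in tuples:

    val t = Tuple23("a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
      "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w")
    assert(t.productArity == 23)        // overridden to return 23
    assert(t.productElement(22) == "w") // productElement(22) maps to _23
    println(t)                          // prints (a,b,...,w) via the overridden toString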

2. Create the dataframe using the newly created Product23 and Tuple23

 // To create a dataframe of Seq[Tuple23[...]], you need the Product23 and Tuple23 defined above
    val df2 = Seq.empty[Tuple23[
      String, String, String, String, String, String, String, String, String, String,
      String, String, String, String, String, String, String, String, String, String,
      String, String, String]].toDF(Range(1, 24).map(s => s"col$s"): _*)
    df2.show(false)

    /**
      * +----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
      * |col1|col2|col3|col4|col5|col6|col7|col8|col9|col10|col11|col12|col13|col14|col15|col16|col17|col18|col19|col20|col21|col22|col23|
      * +----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
      * +----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
      */
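For completeness, the same Tuple23 can also carry actual rows; a minimal sketch with placeholder values (v1..v23) rather than real employee data:

    // one sample row with placeholder values, using the same generated column names
    val df3 = Seq(
      Tuple23("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
        "v21", "v22", "v23")
    ).toDF(Range(1, 24).map(s => s"col$s"): _*)
    df3.show(false)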
