Why does a Spark Dataset with a tuple of Double replace null with -1.0?

Problem Description

Here are simple steps to reproduce the behavior in the Spark shell:

scala> case class Foo(d: Option[Double])
defined class Foo

scala> val df = spark.createDataFrame(Seq(Foo(None), Foo(Some(1.0))))
df: org.apache.spark.sql.DataFrame = [d: double]

scala> df.as[Double].printSchema
root
 |-- d: double (nullable = true)

scala> df.as[Double].collect
java.lang.NullPointerException: Null value appeared in non-nullable field:
- root class: "scala.Double"
If the schema is inferred from a Scala tuple/case class, or a Java bean, please try to use scala.Option[_] or other nullable types (e.g. java.lang.Integer instead of int/scala.Int).
  at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificSafeProjection.apply(Unknown Source)
  at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collectFromPlan$1.apply(Dataset.scala:2864)
  at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$collectFromPlan$1.apply(Dataset.scala:2861)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
  at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
  at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:2861)
  at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2387)
  at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2387)
  at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2842)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
  at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2841)
  at org.apache.spark.sql.Dataset.collect(Dataset.scala:2387)
  ... 48 elided

scala> df.as[Tuple1[Double]].printSchema
root
 |-- d: double (nullable = true)

scala> df.as[Tuple1[Double]].collect
res26: Array[(Double,)] = Array((-1.0,), (1.0,))
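
Decoding to a top-level Double fails fast with a NullPointerException, yet decoding to Tuple1[Double] silently turns the null into -1.0. Where does that -1.0 come from?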

Tags: apache-spark

Solution
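
The stack trace points into Catalyst's generated deserializer (GeneratedClass$SpecificSafeProjection.apply), which is where both behaviors come from. A likely explanation, based on Spark 2.x's code generation: scala.Double is a JVM primitive and cannot hold null, so the generated code declares a plain double variable and initializes it with Catalyst's default value for that type, which is -1.0 for double and -1 for the other numeric primitives (see CodeGenerator.defaultValue). For a top-level as[Double] the deserializer wraps the column in an AssertNotNull check, which is what raises the NullPointerException above; in the Spark version this trace comes from, the Tuple1[Double] deserializer apparently skips that assertion, so the -1.0 placeholder leaks into the result instead of an error.

The practical fix is the one the error message itself suggests: make nullability explicit in the target type. A minimal sketch (Spark 2.x shell, with df and Foo as defined above; the results in the comments are what the Option encoders should produce, not captured output):

// Nulls survive as None instead of being replaced by -1.0:
df.as[Tuple1[Option[Double]]].collect()   // Array((None,), (Some(1.0),))

// Or decode back into the case class the data was created from:
df.as[Foo].collect()                      // Array(Foo(None), Foo(Some(1.0)))

// If nulls are genuinely invalid, drop them before decoding:
df.na.drop().as[Tuple1[Double]].collect() // Array((1.0,))

When Option[_] is inconvenient, java.lang.Double (e.g. via Encoders.DOUBLE) is another nullable target type, as the error message also hints with its java.lang.Integer example.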

