首页 > 解决方案 > 在火花中如何从结构中抽象出数组并用它创建一个新字段

问题描述

我有一个带有架构的结构:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)

现在,我想用 product_color 创建一个新列,所以在我的数据框中我添加了一个新列,例如

  df.withColumn("product_color", col(currentNode + "." + fieldName))

使用新列的架构:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: integer (nullable = true)
 |    |    |    |-- color: string (nullable = true)
 |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |-- products_id: long (nullable = true)

如果您查看 product_color 的架构,则会添加数组元素。

|-- element: array (containsNull = true)

我正在寻求帮助以了解如何使用产品结构内部的确切架构创建新列。

预期架构:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- color: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- products_id: long (nullable = true)

Spark:2.4.5 语言:Scala

标签: apache-sparkapache-spark-sql

解决方案


在添加时new column 分解数组以获得您想要的架构。

Example:

//sample df schema

df.printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- order_id: long (nullable = true)
// |    |    |-- product_color: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- color: string (nullable = true)
// |    |    |    |    |-- id: long (nullable = true)
// |    |    |    |    |-- products_id: long (nullable = true)

df.withColumn("product_color",explode(col("products.product_color"))).printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- order_id: long (nullable = true)
// |    |    |-- product_color: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- color: string (nullable = true)
// |    |    |    |    |-- id: long (nullable = true)
// |    |    |    |    |-- products_id: long (nullable = true)
// |-- product_color: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- color: string (nullable = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- products_id: long (nullable = true)

推荐阅读