首页 > 解决方案 > 展平嵌套的 json 文件

问题描述

示例 json 文件

{"Response":{"MetaInfo":{"Timestamp":"2019-11-11T11:25:16.303+0000","NextPageInformation":"2"},"View":[{"_type":"SearchResultsViewType","ViewId":0,"Result":[{"Relevance":1.0,"Distance":4.8,"MatchLevel":"street","MatchQuality":{"Country":1.0,"State":1.0,"County":1.0,"City":1.0,"District":1.0,"Subdistrict":1.0,"PostalCode":1.0},"Location":{"LocationId":"NT_cu4rBChIN2x48NzUtd2.zB_l_715011352_R","LocationType":"point","DisplayPosition":{"Latitude":28.5579732,"Longitude":77.2870505},"MapView":{"TopLeft":{"Latitude":28.55798,"Longitude":77.28678},"BottomRight":{"Latitude":28.55797,"Longitude":77.28762}},"Address":{"Label":"Noor Nagar-Ajmal Bagh, Jamia Nagar, Delhi 110025, India","Country":"IND","State":"DL","County":"South","City":"Delhi","District":"Jamia Nagar","Subdistrict":"Noor Nagar-Ajmal Bagh","PostalCode":"110025","AdditionalData":[{"value":"India","key":"CountryName"},{"value":"Delhi","key":"StateName"},{"value":"South","key":"CountyName"}]},"MapReference":{"ReferenceId":"715011352","Spot":0.32,"SideOfStreet":"right","CountryId":"22806254","StateId":"22803332","CountyId":"22803085","DistrictId":"22803433"}}}]}]}}

从这里我想得到地址字段。

我可以获取到位置但不能获取地址

# Flatten the Response.View.Result records into rows, then expand the
# nested "Location" dict into top-level columns.
# NOTE(review): `data` (the parsed JSON) and `merge` (presumably
# toolz.merge or similar, collapsing Location's nested dicts into one
# flat dict) are defined by the caller — confirm.
# FIX: `pd.io.json.json_normalize` was deprecated and removed (pandas 2.0);
# use `pd.json_normalize`. Likewise `drop('Location', 1)` (positional
# `axis`) is rejected by pandas >= 2.0; use the `columns=` keyword.
pd.json_normalize(data, ['Response', 'View', 'Result']).pipe(
    lambda x: x.drop(columns='Location').join(
        x.Location.apply(lambda y: pd.Series(merge(y)))
    )
)

标签: json, flatten

解决方案


使用 Scala Spark，您可以递归地展平 json:

import org.apache.spark.sql.{ Row, SaveMode, SparkSession, DataFrame }
// FIX: the original snippet did not compile — `ArrayType`/`StructType`
// live in org.apache.spark.sql.types and `explode`/`col` in
// org.apache.spark.sql.functions; both imports were missing.
import org.apache.spark.sql.types.{ ArrayType, StructType }
import org.apache.spark.sql.functions.{ col, explode }

/**
 * Recursively flattens a DataFrame read from nested JSON:
 * array-of-struct columns are exploded into rows, struct columns are
 * expanded into one column per field (`col("name.*")`), and the result
 * is re-scanned until no nested column remains.
 *
 * @param df the (possibly nested) input DataFrame
 * @return a DataFrame with no struct or array-of-struct columns left
 */
def recurs(df: DataFrame): DataFrame = {
  // Termination check: stop once no column is a struct or an array of structs.
  if (df.schema.fields.find(_.dataType match {
    case ArrayType(StructType(_), _) | StructType(_) => true
    case _ => false
  }).isEmpty) df
  else {
    val columns = df.schema.fields.map(f => f.dataType match {
      // explode turns each array element into its own row (keeps the column name)
      case _: ArrayType  => explode(col(f.name)).as(f.name)
      // "name.*" expands a struct into one top-level column per field
      case s: StructType => col(s"${f.name}.*")
      case _             => col(f.name)
    })
    recurs(df.select(columns: _*))
  }
}

val df = spark.read.json(json_location)
// FIX: `flatten_df = recurs(df)` is not valid Scala outside a REPL
// reassignment — the declaration needs `val`.
val flatten_df = recurs(df)
flatten_df.show()

推荐阅读