sql - 使用嵌套行(类型 STRUCT)对表 SQL 进行重复数据删除
问题描述
我有一个可能重复行的 SQL 表(在 BigQuery 中)。该表有 20 多列,其中一些是嵌套的(数据类型“STRUCT”)。我想对表进行重复数据删除。
我不能简单地查询SELECT DISTINCT *
,因为我得到一个错误
查询错误:STRUCT 类型的列选项不能在 SELECT DISTINCT 中使用
到目前为止,我尝试根据某些列的哈希创建一个唯一 ID。我现在有了这个唯一 ID(称为sha256
),但我想不出一种只选择具有唯一哈希的行的方法。
我尝试了GROUP BY
,但它不适用于STRUCT
类型,我也尝试INNER JOIN
使用仅包含唯一散列的表,但我也得到重复。
作为参考,以下是数据集的 2 行示例:
{
"sha256": "un2k3TUtzwzmQMvxfrjztsh/A/GW3WWzV4U4CezqceA=",
"has_phone": true,
"options": {
"sub_toplist": true,
"gallery": false,
"urgent": false,
"has_option": true,
"photosup": true,
"booster": false
},
"calendar": {
"dates": null
},
"owner": {
"siren": null,
"pro_rates_link": null,
"user_id": "f0d94687-1a24-4ed4-8adb-7faded053ca8",
"type": "private",
"no_salesmen": true,
"name": "marius",
"store_id": "5022456"
},
"location": {
"feature": {
"properties": null,
"geometry": {
"coordinates": [
"9.41733",
"42.54701"
],
"type": "Point"
},
"type": "Feature"
},
"is_shape": true,
"provider": "here",
"lng": "9.41733",
"lat": "42.54701",
"zipcode": "20290",
"city_label": "Lucciana 20290",
"city": "Lucciana",
"region_name": "Corse",
"department_name": null,
"source": "city",
"department_id": "0",
"region_id": "9"
},
"attributes": {
"pro_rates_link": null,
"immo_sell_type": "old",
"ges": "a",
"square": "92",
"rooms": "4",
"energy_rate": "b",
"is_import": false,
"custom_ref": null,
"lease_type": "sell",
"real_estate_type": "1",
"fai_included": null,
"type_real_estate_sale": null
},
"price_calendar": null,
"price": [
"270000"
],
"body": "text",
"url": "https://www.example.fr/ventes_immobilieres/1729537955.htm",
"category_name": "Ventes immobilières",
"category_id": "9",
"images": {
"urls_thumb": [
"https://img3.example.fr/ad-thumb/d63e236ce3546906b3ce661640a7cf858d0a0593.jpg"
],
"urls": [
"https://img3.example.fr/ad-image/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",
],
"nb_images": "7",
"small_url": "https://img3.example.fr/ad-small/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",
"thumb_url": "https://img3.example.fr/ad-thumb/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg"
},
"ad_type": "offer",
"first_publication_date": "2020-01-02 15:00:46 UTC",
"status": "active",
"subject": "Villa à Lucciana",
"index_date": "2020-01-16 15:00:45 UTC",
"expiration_date": "2020-03-02 15:00:46 UTC",
"list_id": "1729537955"
},
{
"sha256": "wCMrggkqSJ3PgbkuWAgBpCMtFfkJDRlz6TOeO5Nngsg=",
"has_phone": true,
"options": {
"sub_toplist": false,
"gallery": false,
"urgent": false,
"has_option": false,
"photosup": false,
"booster": false
},
"calendar": {
"dates": null
},
"owner": {
"siren": null,
"pro_rates_link": null,
"user_id": "ae0f432d-0aa2-4828-a20b-3472255588b4",
"type": "private",
"no_salesmen": true,
"name": "M.Milleliri",
"store_id": "12132533"
},
"location": {
"feature": {
"properties": null,
"geometry": {
"coordinates": [
"9.1917",
"41.54506"
],
"type": "Point"
},
"type": "Feature"
},
"is_shape": true,
"provider": "here",
"lng": "9.1917",
"lat": "41.54506",
"zipcode": "20146",
"city_label": "Sotta 20146",
"city": "Sotta",
"region_name": "Corse",
"department_name": null,
"source": "city",
"department_id": "0",
"region_id": "9"
},
"attributes": {
"pro_rates_link": null,
"immo_sell_type": "old",
"ges": "Non renseigné",
"square": null,
"rooms": null,
"energy_rate": "Non renseigné",
"is_import": false,
"custom_ref": null,
"lease_type": "sell",
"real_estate_type": "3",
"fai_included": null,
"type_real_estate_sale": null
},
"price_calendar": null,
"price": [
"100000"
],
"body": "text",
"url": "https://www.example.fr/ventes_immobilieres/1736199673.htm",
"category_name": "Ventes immobilières",
"category_id": "9",
"images": {
"urls_thumb": [
"https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
],
"urls": [
"https://img3.example.fr/ad-image/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
],
"urls_large": [
"https://img3.example.fr/ad-large/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
],
"nb_images": "1",
"small_url": "https://img3.example.fr/ad-small/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg",
"thumb_url": "https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
},
"ad_type": "offer",
"first_publication_date": "2020-01-16 14:21:05 UTC",
"status": "active",
"subject": "Terrain 1250 m2 Sotta",
"index_date": "2020-01-16 14:21:05 UTC",
"expiration_date": "2020-03-16 14:21:05 UTC",
"list_id": "1736199673"
}
and the query I'm working on so far:
WITH
table_unique_hash AS (
SELECT
DISTINCT(SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING)))) AS sha256
FROM
`test_bucket_data.daily_table`),
table_hashed AS (
SELECT
SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING))) AS sha256, *
FROM
`test_bucket_data.daily_table`)
SElECT * FROM table_hashed
limit 10;
A solution would be to find a way of inner join the table_hashed
and the table_unique_hash
on the sha256 column...
Thanks for your help!
解决方案
I found a workaround based on this topic. The combination of GROUP BY
and ANY
function for all the STRUCT columns made it work!
SELECT
has_phone,
ANY_VALUE(options) as options,
ANY_VALUE(calendar) as calendar,
ANY_VALUE(owner) as owner,
ANY_VALUE(location) as location,
ANY_VALUE(attributes) as attributes,
price_calendar,
price,
body,
url,
category_name,
category_id,
ANY_VALUE(images) as images,
ad_type,
first_publication_date,
status,
subject,
index_date,
expiration_date,
list_id,
FROM
`{table_name}`
Group by
has_phone,
price_calendar,
price,
body,
url,
category_name,
category_id,
ad_type,
first_publication_date,
status,
subject,
index_date,
expiration_date,
list_id
Note: my "price" field was previously an array
; I transformed it in my source json to an int
推荐阅读
- django - Django 在退出视图或导航到与当前 URL 不同的 URL 时运行函数
- c - 查找数组中的哪些值至少出现 x 次
- d3.js - 在 D3 中可视化来自不同 csv 文件的数据
- html - 页面内容不会在具有 RTL 属性的移动视图中显示为全宽
- jquery - 在“422(无法处理的实体)”Laravel7 中验证错误时出现错误
- javascript - 未捕获的 ReferenceError:未定义 mahi
- python - 如何在临时变量中设置一个大的 Numpy 来编写视频?
- google-cloud-platform - 无法从文件“/root/.google_libcloud_auth.gcp-cm-tool”中读取缓存的身份验证令牌
- azure-sqldw - Azure Synapse:将一个 n 长度的分隔列表列转换为 n 个不同的列
- python - 将 JSON 数据加载到 csv Spotify 相关艺术家