google-bigquery - Bigquery:根据亲属表查找家族
问题描述
我有一个家庭关系数据库:
with example_data as(
SELECT 'abc' as relative_1, 'def' as relative_2
union all
SELECT 'abc' as relative_1, '123' as relative_2
union all
SELECT 'def' as relative_1, '334' as relative_2
union all
SELECT 'fdc' as relative_1, '123' as relative_2
union all
SELECT 'fgl' as relative_1, '342' as relative_2
)
我怎样才能根据这些数据创建完整的家族氏族,以便我收到这种结果:
我为创建所需输出而编写的代码似乎根本不实用,实际上,对于一个包含 100k 行输入的表,我的查询在第 5 次自加入后达到了 6 小时的限制。我不担心最终会导致整张桌子连成一条长线——我知道一个氏族中只有这么多的家庭成员。
或者,如果可以将结果作为嵌套表返回,并且list_all_relatives
作为重复字段在 relative_1 和相应的远亲之间具有最少的步数,那就太好了。
我在图像中返回结果的低效代码:
SELECT 'abc' as relative_1, 'def' as relative_2, 'abc' as list_0
union all
SELECT 'abc' as relative_1, '123' as relative_2, 'abc' as list_0
union all
SELECT 'def' as relative_1, '334' as relative_2, 'def' as list_0
union all
SELECT 'fdc' as relative_1, '123' as relative_2, 'fdc' as list_0
union all
SELECT 'fgl' as relative_1, '342' as relative_2, 'fgl' as list_0
)
,
step_0 as (
SELECT relative_1, relative_2,
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(relative_1,',',relative_2,',',list_0), ',')) AS x ORDER BY x), ',') AS combined_list
from raw_data
)
,
step_1 as (
SELECT relative_1, relative_2, list_1,
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(relative_1,',',relative_2,',',list_1), ',')) AS x ORDER BY x), ',') AS combined_list
from
step_0
left join
(select relative_2,combined_list as list_1 from step_0)
using(relative_2)
)
,
step_2 as (
SELECT distinct * except (combined_list,list_1),
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(combined_list,',',list_2), ',')) AS x ORDER BY x), ',') AS combined_list,
from
step_1
left join
(select relative_1,combined_list as list_2 from step_1)
using(relative_1)
)
,
step_3 as (
SELECT distinct * except (combined_list,list_2),
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(combined_list,',',list_3), ',')) AS x ORDER BY x), ',') AS combined_list,
from
step_2
left join
(select relative_2,combined_list as list_3 from step_2)
using(relative_2)
)
,
step_4 as (
SELECT distinct * except (combined_list,list_3),
ARRAY_TO_STRING(ARRAY(SELECT DISTINCT x FROM UNNEST(SPLIT(concat(combined_list,',',list_4), ',')) AS x ORDER BY x), ',') AS combined_list,
from
step_3
left join
(select relative_1,combined_list as list_4 from step_3)
using(relative_1)
),
step_N as (
SELECT *
from step_4
)
,
step_prefinal as (
SELECT distinct
relative_1,list_4, combined_list,
1+length(REGEXP_REPLACE(REGEXP_REPLACE(LOWER(combined_list), '[a-z]', ''),'[0-9]','')) as n_elements_in_list,
max(1+length(REGEXP_REPLACE(REGEXP_REPLACE(LOWER(combined_list), '[a-z]', ''),'[0-9]',''))) over (partition by relative_1) as longest_list_relatives
from step_N
)
,
step_final as (
SELECT relative_1, combined_list, n_elements_in_list,
count(*) over (partition by relative_1) as cnt_lists_per_relative,
max(n_elements_in_list) over (partition by relative_1) as max_elements
from
step_prefinal
where true
and longest_list_relatives = n_elements_in_list
group by 1,2,3
)
,
stats as (
SELECT cnt_lists_per_relative, count(distinct relative_1) as cnt,
max(max_elements) as max_elements
from
step_final
group by 1
order by 1
)
SELECT relative_1, combined_list as list_all_relatives
from step_final
where true
解决方案
我能够创建代码以更简单的方式重现您的输出。
我已经使用提供的 example_data 来重现它。我已经使用 UDF 和 JavaScript 实现了所需的输出。下面是代码:
#Custom UDF to return a list of strings from an array
CREATE TEMP FUNCTION
rel (relatives ARRAY<String>)
RETURNS string
LANGUAGE js AS '''
return relatives;
''';
#provided data
WITH example_data AS(
SELECT 'abc' AS relative_1, 'def' AS relative_2
UNION ALL
SELECT 'abc' AS relative_1, '123' AS relative_2
UNION ALL
SELECT 'def' AS relative_1, '334' AS relative_2
UNION ALL
SELECT 'fdc' AS relative_1, '123' AS relative_2
UNION ALL
SELECT 'fgl' AS relative_1, '342' AS relative_2
),
#Manipulating the data
data AS
(
SELECT
t2.key,
ARRAY_AGG(CAST(relative_2 AS string) ) AS relatives
FROM example_data t1
LEFT JOIN ( SELECT DISTINCT relative_1 AS key FROM example_data
GROUP BY relative_1) t2
ON key=relative_1
GROUP BY 1
ORDER BY key
)
#selecting the desired fields and using the UDF
SELECT key, rel(relatives) AS list_of_relatives FROM data
ORDER BY key
如您所见,第一步是创建一个UDF ,它接收 Strings 的嵌套字段,并简单地将它们返回到每个“ Key ”的字符串列表中。在声明example_data之后,在第 2 步中,必须执行一些数据操作。为了实现如下表:
如您所见,该表具有relative_1(命名为 key)和嵌套列中的亲属。
之后,选择所需的字段。这意味着,键和UDF为list_of_relatives,这是在第一步中编写的,输出如下:
推荐阅读
- sqlalchemy - 使用 sqlalchemy 插入逻辑
- haskell - 如何修复这种嵌套 fmap 仿函数的混乱?
- javascript - 有没有办法解构 vue 道具,所以我不必传递每一个道具?
- python - 有没有办法让特定类的所有子类在访问属性时执行一个函数?
- computer-vision - 在detectron2中过滤最小边界框
- javascript - TypeScript - 如何在 pdf 中的图像上放置 QR 码?
- text-justify - Android中的文本对齐功能
- json - 如何让 Eclipse 内部浏览器显示 json 文本?
- javascript - 需要评估元素的类并在代码不正确时循环代码
- python - 如何在枕头库中输入波斯语文本