首页 > 解决方案 > BigQuery 中的按数组循环和联合循环结果

问题描述

我写了一个脚本来执行以下操作

我相信脚本可能会更干净,但我不知道如何。有什么建议么?谢谢!

declare day_event array<int64>;
declare night_event array<int64>;
declare cross_day_event array<int64>;
declare i int64 default 0;
declare j int64 default 0;
declare k int64 default 0;

-- query three arrays based on different conditions. 
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);

-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop 
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;

-- array 2 and condition 2
create or replace temp table night_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop
set j = j+1;
if j > array_length(night_event) then leave;
end if;
create or replace temp table night_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = night_event[ordinal(j)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from night_event_clean;
end loop;

-- array 3 and condition 3
create or replace temp table cross_day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop
set k = k+1;
if k > array_length(cross_day_event) then leave;
end if;
create or replace temp table cross_day_event_clean as 
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = cross_day_event[ordinal(k)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp(date(endTime_adj)), interval 12 hour)) dt
union all select * from cross_day_event_clean;
end loop;

--query the original data and union with all the cleaned data
select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId not in
(condition_1 or condition_2 or conditon_3) union all
select * from day_event_clean where eventId is not null union all
select * from night_event_clean where eventId is not null union all
select * from cross_day_event_clean where eventId is not null
order by startTime_adj;

标签: loopsgoogle-bigquerycode-cleanup

解决方案


快速浏览脚本顶部的快速推荐 #1

所以下面的片段太冗长了,最重要的是查询同一个表 3 次 - 所以成本是应该的 3 倍!!!

-- query three arrays based on different conditions. 
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);    

而不是上面你应该使用下面的简单并且只需扫描你的表格脚本

set (day_event, night_event, cross_day_event) = (
  select as struct
    array_agg(if(condition_1, id, null) ignore nulls ) array1,
    array_agg(if(condition_2, id, null) ignore nulls ) array2,
    array_agg(if(condition_3, id, null) ignore nulls ) array3
  from (
    select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
    from `event_raw_data` 
  )
);   

我希望上面的例子能让你在脚本的其余部分朝着正确的方向前进,因为你真的不需要像你所拥有的条件一样多次重复你的东西。你只需要应用类似于上述逻辑 - 这并不是那么简单 - 但在大多数情况下是相当可行的

另一个建议 - 在你的情况下使用循环是不合理的 - 你不需要循环只是为了遍历数组元素 - 你宁愿使用 unnest 函数以集合方式(sql方式 - 在一个查询中)

最后,如果您仍然需要帮助 - 我建议您简化循环逻辑的示例并将其作为单独的问题发布 - 否则它(当前问题)过于宽泛,无法回答和解决所有可能的改进(这相当多在这里制作)

更新

有一些额外的时间来跟进你的循环。
因此,让我们转换您的第一个循环
而不是脚本的以下片段

-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop 
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;   

您只需使用一个简单的查询

create or replace temp table day_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj 
from (
  select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj 
  from `event_raw_data` 
  where eventId in unnest(day_event)
) t 
cross join unnest(
  generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)
) dt     

它不仅不那么冗长且更易于维护 - 它可以为您节省 $$$,因为它不会event_raw_data像相应数组中的元素一样多次扫描表 - 上面的查询只是一个!

相同的方法适用于其余两个循环

最后,在您应用上述所有内容并最终得到紧凑且易于管理的脚本之后 - 我真的希望您会看到整个原始 [巨大] 脚本可以作为一个相对简单的查询来实现 - 只是一个。正如我之前提到的 - 如果你会达到这一点并且仍然需要帮助 - 使用你将能够在那一刻得到的脚本发布新问题

祝你好运:o)


推荐阅读