loops - BigQuery 中的按数组循环和联合循环结果
问题描述
我写了一个脚本来执行以下操作
day_event
为需要清理方法 1 的条件 1 下的事件创建包含 eventId 的数组night_event
为需要清洁方法 2 的条件 2创建数组cross_day_event
为需要清洁方法 3 的条件 3创建数组day_event
使用方法 1循环查询并清理每个事件night_event
使用方法 2循环查询并清理每个事件cross_day_event
使用方法 3循环查询并清理每个事件
- 注意:清理后,一行将分解为几行- 查询原始数据但删除原始条目,并合并所有清理后的数据
我相信脚本可能会更干净,但我不知道如何。有什么建议么?谢谢!
declare day_event array<int64>;
declare night_event array<int64>;
declare cross_day_event array<int64>;
declare i int64 default 0;
declare j int64 default 0;
declare k int64 default 0;
-- query three arrays based on different conditions.
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);
-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
loop
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;
-- array 2 and condition 2
create or replace temp table night_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
loop
set j = j+1;
if j > array_length(night_event) then leave;
end if;
create or replace temp table night_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = night_event[ordinal(j)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from night_event_clean;
end loop;
-- array 3 and condition 3
create or replace temp table cross_day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
loop
set k = k+1;
if k > array_length(cross_day_event) then leave;
end if;
create or replace temp table cross_day_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = cross_day_event[ordinal(k)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp(date(endTime_adj)), interval 12 hour)) dt
union all select * from cross_day_event_clean;
end loop;
--query the original data and union with all the cleaned data
select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId not in
(condition_1 or condition_2 or conditon_3) union all
select * from day_event_clean where eventId is not null union all
select * from night_event_clean where eventId is not null union all
select * from cross_day_event_clean where eventId is not null
order by startTime_adj;
解决方案
快速浏览脚本顶部的快速推荐 #1
所以下面的片段太冗长了,最重要的是查询同一个表 3 次 - 所以成本是应该的 3 倍!!!
-- query three arrays based on different conditions.
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);
而不是上面你应该使用下面的简单并且只需扫描你的表格脚本
set (day_event, night_event, cross_day_event) = (
select as struct
array_agg(if(condition_1, id, null) ignore nulls ) array1,
array_agg(if(condition_2, id, null) ignore nulls ) array2,
array_agg(if(condition_3, id, null) ignore nulls ) array3
from (
select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`
)
);
我希望上面的例子能让你在脚本的其余部分朝着正确的方向前进,因为你真的不需要像你所拥有的条件一样多次重复你的东西。你只需要应用类似于上述逻辑 - 这并不是那么简单 - 但在大多数情况下是相当可行的
另一个建议 - 在你的情况下使用循环是不合理的 - 你不需要循环只是为了遍历数组元素 - 你宁愿使用 unnest 函数以集合方式(sql方式 - 在一个查询中)
最后,如果您仍然需要帮助 - 我建议您简化循环逻辑的示例并将其作为单独的问题发布 - 否则它(当前问题)过于宽泛,无法回答和解决所有可能的改进(这相当多在这里制作)
更新
有一些额外的时间来跟进你的循环。
因此,让我们转换您的第一个循环
而不是脚本的以下片段
-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
loop
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;
您只需使用一个简单的查询
create or replace temp table day_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj
from (
select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`
where eventId in unnest(day_event)
) t
cross join unnest(
generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)
) dt
它不仅不那么冗长且更易于维护 - 它可以为您节省 $$$,因为它不会event_raw_data
像相应数组中的元素一样多次扫描表 - 上面的查询只是一个!
相同的方法适用于其余两个循环
最后,在您应用上述所有内容并最终得到紧凑且易于管理的脚本之后 - 我真的希望您会看到整个原始 [巨大] 脚本可以作为一个相对简单的查询来实现 - 只是一个。正如我之前提到的 - 如果你会达到这一点并且仍然需要帮助 - 使用你将能够在那一刻得到的脚本发布新问题
祝你好运:o)
推荐阅读
- javascript - 如何将 Google 表格脚本代码启动到 Google 应用脚本网络应用中?
- reactjs - 如何使用带有反应钩子的 redux 状态
- bash - 使用grep打印存储在循环变量中的重叠字符串的字符位置?
- amazon-s3 - 阻止管理员通过控制台访问 S3 存储桶
- c++ - 默认模板类 lambda
- java - 在准备好的语句中批量运行多个查询
- azure - AzureMapsCodeSamples 无法在本地驱动器中运行
- c# - 如何使用 ICollection 属性进行布尔检查
- javascript - 将按钮添加到 $.dialog (jquery-confirm.js)
- android - 数组列表
> 在 Parcelable 对象中