首页 > 解决方案 > 有没有使这个查询脚本更有效?

问题描述

我的任务是创建一个查询,给定一个表,它将对每 20 行进行一次批处理聚合,并生成一个包含给定表 5% 行的新表。因此,如果给定一个 100 行的表,则新表将是 5 行,其中每行是行的聚合:[1-20]、[21-40]、[41-60]、[61-80] ,[81-100]。

此任务的目的是确定特定流程的“周期”,其中每个周期是一个单独的列“Cycle_X”,其中“X”是从 1 到 12 的数字。每个“Cycle_X”列都填充有一个从 0 开始的数字到 4 表示优先级;0 表示不优先。每个“Cycle”都应该是连续的,因此“Cycle_1”在“Cycle_2”之前,在“Cycle_3”之前等等。我的查询确定每个小批量的周期的方式是通过在批次中选择“Cycle_X”列最小的零个数。希望有助于澄清我上面解释的内容,

我已经在超过 80k 行的数据集上测试了我的查询,但由于连接超时,它永远无法完成,现在我的数据集超过一百万行。鉴于目前的情况,我正在远程工作,所以我认为远程连接不够稳定,无法完成该过程,但由于我不确定这里是确切的错误消息:

Msg 121, Level 20, State 0, Line 0
从服务器接收结果时发生传输级错误。(提供者:TCP 提供者,错误:0 - 信号量超时期限已过。)

我首先创建的查询为批处理聚合创建了一个基表,并为批处理聚合创建了一个空表。在声明和设置一些变量之后,有一个 while 循环使用 INSERT INTO 语句一次填充最初的空表一行(来自基表的 20 行的最小批次中的一行)。使用大量 CASE WHEN 语句确定各个周期的最小零计数。完整的查询可以在下面找到,我将不胜感激任何有助于提高效率的帮助。到目前为止,我所做的唯一更改是将表创建和 while 循环分离到单独的查询中。

USE _dataBase

-- Create a temp table to perform batch aggregation on
CREATE TABLE dbo.baseTable
WITH (DISTRIBUTION = ROUND_ROBIN) AS
SELECT
    ROW_NUMBER() OVER (ORDER BY _time) AS idx,
    _time,
    CASE WHEN Cycle_1 = 0 THEN 1 ELSE 0 END AS z1, --logical index that detetcts zero and not zero
    CASE WHEN Cycle_2 = 0 THEN 1 ELSE 0 END AS z2,
    CASE WHEN Cycle_3 = 0 THEN 1 ELSE 0 END AS z3,
    CASE WHEN Cycle_4 = 0 THEN 1 ELSE 0 END AS z4,
    CASE WHEN Cycle_5 = 0 THEN 1 ELSE 0 END AS z5,
    CASE WHEN Cycle_6 = 0 THEN 1 ELSE 0 END AS z6,
    CASE WHEN Cycle_7 = 0 THEN 1 ELSE 0 END AS z7,
    CASE WHEN Cycle_8 = 0 THEN 1 ELSE 0 END AS z8,
    CASE WHEN Cycle_9 = 0 THEN 1 ELSE 0 END AS z9,
    CASE WHEN Cycle_10 = 0 THEN 1 ELSE 0 END AS z10
FROM dbo.srcTable
WHERE _date = 20200423 --this must be parameterised

-- Create a table to store the batch aggregation (The table name should be parameterised somehow)
CREATE TABLE dbo.aggTable
(
    startTime datetime NOT NULL,
    endTime datetime NOT NULL,
    sum1 int NOT NULL,
    sum2 int NOT NULL,
    sum3 int NOT NULL,
    sum4 int NOT NULL,
    sum5 int NOT NULL,
    sum6 int NOT NULL,
    sum7 int NOT NULL,
    sum8 int NOT NULL,
    sum9 int NOT NULL,
    sum10 int NOT NULL,
    cycle int NOT NULL
)
--Declare variables
DECLARE
    @maxIter int, --The amount of times the loop runs
    @numRows int, --Number of rows in the temp table
    @batch int, --The size of the batch aggregation
    @offSet int, --The stat row of a batch that is updated during the loop
    @limit int, --The end row of a batch that is updated during the loop
    @iter int --Loop counter
--Set variables
SET @numRows = (SELECT COUNT(idx) FROM dbo.baseTable)
SET @maxIter = @numRows / 20
SET @batch = 20
SET @offSet = 1
SET @limit = 20
SET @iter = 0

--The loop
WHILE @iter < @maxIter
BEGIN
    --Append to this table
    INSERT INTO dbo.aggTable
    (
        startTime,
        endTime,
        sum1,
        sum2,
        sum3,
        sum4,
        sum5,
        sum6,
        sum7,
        sum8,
        sum9,
        sum10,
        cycle
    )
    SELECT
        MIN(_time) AS startTime,
        MAX(_time) AS endTime,
        SUM(z1) AS sum1, --this counts the number of zeroes for Cycle_1
        SUM(z2) AS sum2,
        SUM(z3) AS sum3,
        SUM(z4) AS sum4,
        SUM(z5) AS sum5,
        SUM(z6) AS sum6,
        SUM(z7) AS sum7,
        SUM(z8) AS sum8,
        SUM(z9) AS sum9,
        SUM(z10) AS sum10,
        --this case when clause is used to determine the Cycle basd on the minimum count of zeroes
        CASE
            WHEN SUM(z1) <= SUM(z2) AND SUM(z1) <= SUM(z3) AND SUM(z1) <= SUM(z4) AND SUM(z1) <= SUM(z5) AND SUM(z1) <= SUM(z6) AND SUM(z1) <= SUM(z7) AND SUM(z1) <= SUM(z8) AND SUM(z1) <= SUM(z9) AND SUM(z1) <= SUM(z10) THEN 1
            WHEN SUM(z2) <= SUM(z1) AND SUM(z2) <= SUM(z3) AND SUM(z2) <= SUM(z4) AND SUM(z2) <= SUM(z5) AND SUM(z2) <= SUM(z6) AND SUM(z2) <= SUM(z7) AND SUM(z2) <= SUM(z8) AND SUM(z2) <= SUM(z9) AND SUM(z2) <= SUM(z10) THEN 2
            WHEN SUM(z3) <= SUM(z1) AND SUM(z3) <= SUM(z2) AND SUM(z3) <= SUM(z4) AND SUM(z3) <= SUM(z5) AND SUM(z3) <= SUM(z6) AND SUM(z3) <= SUM(z7) AND SUM(z3) <= SUM(z8) AND SUM(z3) <= SUM(z9) AND SUM(z3) <= SUM(z10) THEN 3
            WHEN SUM(z4) <= SUM(z1) AND SUM(z4) <= SUM(z2) AND SUM(z4) <= SUM(z3) AND SUM(z4) <= SUM(z5) AND SUM(z4) <= SUM(z6) AND SUM(z4) <= SUM(z7) AND SUM(z4) <= SUM(z8) AND SUM(z4) <= SUM(z9) AND SUM(z4) <= SUM(z10) THEN 4
            WHEN SUM(z5) <= SUM(z1) AND SUM(z5) <= SUM(z2) AND SUM(z5) <= SUM(z3) AND SUM(z5) <= SUM(z4) AND SUM(z5) <= SUM(z6) AND SUM(z5) <= SUM(z7) AND SUM(z5) <= SUM(z8) AND SUM(z5) <= SUM(z9) AND SUM(z5) <= SUM(z10) THEN 5
            WHEN SUM(z6) <= SUM(z1) AND SUM(z6) <= SUM(z2) AND SUM(z6) <= SUM(z3) AND SUM(z6) <= SUM(z4) AND SUM(z6) <= SUM(z5) AND SUM(z6) <= SUM(z7) AND SUM(z6) <= SUM(z8) AND SUM(z6) <= SUM(z9) AND SUM(z6) <= SUM(z10) THEN 6
            WHEN SUM(z7) <= SUM(z1) AND SUM(z7) <= SUM(z2) AND SUM(z7) <= SUM(z3) AND SUM(z7) <= SUM(z4) AND SUM(z7) <= SUM(z5) AND SUM(z7) <= SUM(z6) AND SUM(z7) <= SUM(z8) AND SUM(z7) <= SUM(z9) AND SUM(z7) <= SUM(z10) THEN 7
            WHEN SUM(z8) <= SUM(z1) AND SUM(z8) <= SUM(z2) AND SUM(z8) <= SUM(z3) AND SUM(z8) <= SUM(z4) AND SUM(z8) <= SUM(z5) AND SUM(z8) <= SUM(z6) AND SUM(z8) <= SUM(z7) AND SUM(z8) <= SUM(z9) AND SUM(z8) <= SUM(z10) THEN 8
            WHEN SUM(z9) <= SUM(z1) AND SUM(z9) <= SUM(z2) AND SUM(z9) <= SUM(z3) AND SUM(z9) <= SUM(z4) AND SUM(z9) <= SUM(z5) AND SUM(z9) <= SUM(z6) AND SUM(z9) <= SUM(z7) AND SUM(z9) <= SUM(z8) AND SUM(z9) <= SUM(z10) THEN 9
            ELSE 10 END AS cycle
    FROM dbo.baseTable
    WHERE rowNum >= @offSet --row number of start of batch
    AND rowNum <= @limit --row number of end of batch
    --Update loop variables
    SET @offSet = @offSet + @batch
    SET @limit = @limit + @batch
    SET @iter = @iter + 1
END

--drop the temp table
DROP TABLE dbo.baseTable
--view the aggregated data
SELECT *
FROM dbo.aggTable
ORDER BY startTime

请让我知道是否需要进一步澄清或者这个问题是否不合适。

标签: sql-server

解决方案


如果没有样本数据,这很难做到。但是,看看这对你有什么作用:

如果这不明显,则未经测试

declare @date date = '2020-04-23'
declare @srcTable table (_time datetime, cycle_1 int, cycle_2 int, cycle_3 int
                        ,cycle_4 int, cycle_5 int, cycle_6 int, cycle_7 int
                        ,cycle_8 int, cycle_9 int, cycle_10 int, _date date)



insert into @srcTable
select getdate(), 0, 0, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 1, 1, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 2, 1, 0, 1, 0, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 0, 2, 0, 0, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 4, 1, 0, 1, 0, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 5, 1, 2, 1, 9, 9, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 2, 1, 2, 1, 0, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 0, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 0, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 5, 1, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 0, 0, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 2, 1, 2, 0, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 1, 1, 2, 0, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 3, 1, 2, 0, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 4, 0, 0, 1, 0, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 0, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 6, 1, 2, 1, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 5, 1, 2, 1, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 0, 2, 0, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 0, 1, 0, 0, 9, 0, 2, 3, 4, 9, '2020-04-23'
union all
select getdate(), 3, 2, 1, 0, 9, 2, 3, 4, 0, 2, '2020-04-23'
union all
select getdate(), 0, 0, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 0, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 2, 0, 2, 1, 0, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 0, 9, 0, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 4, 0, 0, 0, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 0, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 5, 0, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 0, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'
union all
select getdate(), 0, 1, 2, 1, 9, 9, 2, 3, 4, 0, '2020-04-23'



select *
    ,(select min(cycle) --what a convoluted way to get the minimum cycle number that equals the maximum number of zero values
        from (values (z1, 1), (z2, 2), (z3, 3), (z4, 4), (z5, 5), (z6, 6), (z7, 7), (z8, 8), (z9, 9), (z10, 10)) AS value(amt, cycle)
        inner join (select (select max(amt) FROM (VALUES (z1), (z2), (z3), (z4), (z5), (z6), (z7), (z8), (z9), (z10)) AS value(amt)) amt) maxamt on maxamt.amt = value.amt) cycle
from (
    select min(_time) startTime, max(_time) endTime -- get first level of aggregated information
        ,sum(CASE WHEN Cycle_1 = 0 THEN 1 ELSE 0 END) AS z1
        ,sum(CASE WHEN Cycle_2 = 0 THEN 1 ELSE 0 END) AS z2
        ,sum(CASE WHEN Cycle_3 = 0 THEN 1 ELSE 0 END) AS z3
        ,sum(CASE WHEN Cycle_4 = 0 THEN 1 ELSE 0 END) AS z4
        ,sum(CASE WHEN Cycle_5 = 0 THEN 1 ELSE 0 END) AS z5
        ,sum(CASE WHEN Cycle_6 = 0 THEN 1 ELSE 0 END) AS z6
        ,sum(CASE WHEN Cycle_7 = 0 THEN 1 ELSE 0 END) AS z7
        ,sum(CASE WHEN Cycle_8 = 0 THEN 1 ELSE 0 END) AS z8
        ,sum(CASE WHEN Cycle_9 = 0 THEN 1 ELSE 0 END) AS z9
        ,sum(CASE WHEN Cycle_10 = 0 THEN 1 ELSE 0 END) AS z10
    from (
        select *, ROW_NUMBER() OVER (ORDER BY _time) - 1 AS idx -- this gives us idx to aggregate on
        from @srcTable
        where _date = @date
    ) idx
    group by idx.idx / 20
) a

推荐阅读