首页 > 解决方案 > 为某个子群体创建持续注册孤岛

问题描述

CLIENTID我的最终目标是为单个子群体创建连续注册天数的孤岛:2019 年和 2020 日历年的“成人扩展”。ACLIENTID可以与日历年中的多个子群体相关联,但永远不能相关联一次有多个亚群(招生没有重叠)。我的数据可以追溯到 2016 年,但我只对 2019 年和 2020 年感兴趣。数据的结构是,每一行都是一个注册期,有注册的开始日期和结束日期,与一个子群体相关联。

我在下面包含了一些虚拟数据和所需的输出,以更好地说明我的目标:

CREATE TABLE #t (
CLIENTID NVARCHAR(9),
DEMONSTRATION_POPULATION NVARCHAR(30),
ELIGBEGIN DATE,
ELIGEND DATE, 
AGE INT
)
INSERT INTO #t
VALUES 
('123456789', 'Adult Expansion', '2019-12-16', '2019-12-31', 52)
, ('123456789', 'Adult Expansion', '2020-01-01', '2020-01-15', 52)
, ('123456789', 'Adult Expansion', '2020-03-01', '2020-03-31', 52)
, ('123456789', 'Adult Expansion', '2020-04-01', '2020-04-30', 52)
, ('123456789', 'Adult Expansion', '2020-05-01', '2020-05-31', 52)
, ('123456789', 'Adult Expansion', '2020-06-01', '2020-06-30', 52)
, ('123456789', 'Adult Expansion', '2020-07-01', '2020-07-31', 52)
, ('123456789', 'Adult Expansion', '2020-08-01', '2020-08-31', 52)
, ('123456789', 'Adult Expansion', '2020-09-01', '2020-09-30', 52)
, ('123456789', 'Adult Expansion', '2020-10-01', '2020-10-31', 52)
, ('123456789', 'Adult Expansion', '2020-11-01', '2020-11-30', 52)
, ('123456789', 'Adult Expansion', '2020-12-01', '2020-12-31', 52)
------------------------NEW CLIENTID-----------------------------
,('012345678', 'Demonstration Population #3', '2019-10-01', '2019-10-31', 52)
,('012345678', 'Demonstration Population #3', '2019-11-01', '2019-11-30', 52)
,('012345678', 'Demonstration Population #3', '2019-12-01', '2019-12-31', 52)
,('012345678', 'Demonstration Population #3', '2020-01-01', '2020-01-31', 52)
,('012345678', 'Adult Expansion', '2020-02-01', '2020-02-28', 52)
,('012345678', 'Demonstration Population #3', '2020-02-29', '2020-02-29', 52)
,('012345678', 'Adult Expansion', '2020-03-01', '2020-03-31', 52)
,('012345678', 'Adult Expansion', '2020-04-01', '2020-04-30', 52)
,('012345678', 'Adult Expansion', '2020-05-01', '2020-05-31', 52)
,('012345678', 'Adult Expansion', '2020-06-01', '2020-06-30', 52)
,('012345678', 'Adult Expansion', '2020-07-01', '2020-07-31', 52)
,('012345678', 'Adult Expansion', '2020-08-01', '2020-08-31', 52)
,('012345678', 'Adult Expansion', '2020-09-01', '2020-09-30', 52)
,('012345678', 'Adult Expansion', '2020-10-01', '2020-10-31', 52)
,('012345678', 'Adult Expansion', '2020-11-01', '2020-11-30', 52)
,('012345678', 'Adult Expansion', '2020-12-01', '2020-12-31', 52)
---------------------------NEW CLIENTID---------------------------
,('020234587', 'Adult Expansion',   '2019-06-01', '2019-06-30', 36)
,('020234587', 'Adult Expansion',   '2019-08-01', '2019-08-31', 36)
,('020234587', 'Adult Expansion',   '2019-09-01', '2019-09-30', 36)
,('020234587', 'Adult Expansion',   '2019-10-01', '2019-10-31', 36)
,('020234587', 'Adult Expansion',   '2019-11-01', '2019-11-30', 36)
,('020234587', 'Non-1115-Waiver',   '2019-12-01', '2019-12-31', 36)
,('020234587', 'Non-1115-Waiver',   '2020-01-01', '2020-01-31', 36)
,('020234587', 'Non-1115-Waiver',   '2020-02-01', '2020-02-29', 36)

期望的输出:

客户编号 年龄 开始 精英 序列号
123456789 52 2019-12-19 2019-12-31 1
123456789 52 2020-01-01 2020-01-15 2
123456789 52 2020-03-01 2020-12-31 3
012345678 52 2020-02-01 2020-28-20 1
012345678 52 2020-03-01 2020-03-31 2
020234587 36 2019-06-01 2019-06-30 1
020234587 36 2019-08-01 2019-11-30 2

然后这是我用来尝试解决这个问题的当前代码。

SELECT * INTO #y1 FROM #t
WHERE YEAR(ELIGBEGIN) = '2019'

SELECT s1.CLIENTID
       , s1.AGE
       , CAST(s1.ELIGBEGIN AS DATETIME)    AS ELIGBEGIN
       , MIN(CAST(t1.ELIGEND AS DATETIME)) AS ELIGEND
       , ROW_NUMBER() OVER(PARTITION BY s1.CLIENTID ORDER BY CAST(s1.ELIGBEGIN AS DATETIME)) AS Sequence_ID
INTO #CY19
FROM --[dbo].[Eligs] 
 --#t s1
 #y1 s1
INNER JOIN #y1--#t
t1 ON t1.CLIENTID = s1.CLIENTID
                   AND CAST(s1.ELIGBEGIN AS DATETIME) <= CAST(t1.ELIGEND AS DATETIME)
                   AND t1.DEMONSTRATION_POPULATION = 'Adult Expansion'
    AND NOT EXISTS
    (
    SELECT * FROM #y1 t2--#t t2
                 WHERE t2.CLIENTID = t1.CLIENTID
                   AND (CAST(t1.ELIGEND AS DATETIME) + 1) >= CAST(t2.ELIGBEGIN AS DATETIME)
                         AND CAST(t1.ELIGEND AS DATETIME) < CAST(t2.ELIGEND AS DATETIME)
                   )
WHERE --s1.DEMONSTRATION_POPULATION = 'Adult Expansion' AND 
    NOT EXISTS
    (
    SELECT * FROM #y1 s2--#t s2
                 WHERE s2.CLIENTID = s1.CLIENTID
                   AND CAST(s1.ELIGBEGIN AS DATETIME)       > CAST(s2.ELIGBEGIN AS DATETIME)
                   AND (CAST(s1.ELIGBEGIN AS DATETIME) - 1) <= CAST(s2.ELIGEND AS DATETIME)
                   )
        --AND s1.DEMONSTRATION_POPLUATION = 'Adult Expansion'
        --AND s1.UMIC = '1'
GROUP BY s1.CLIENTID, CAST(s1.ELIGBEGIN AS DATETIME), s1.AGE, s1.ELIGBEGIN, s1.DEMONSTRATION_POPULATION
ORDER BY s1.CLIENTID, CAST(s1.ELIGBEGIN AS DATETIME)

--do the same for the 2020 calendar year:
SELECT * INTO #y2
FROM #t
WHERE YEAR(ELIGBEGIN) = '2020'

SELECT s1.CLIENTID
       , s1.AGE
       , CAST(s1.ELIGBEGIN AS DATETIME)    AS ELIGBEGIN
       , MIN(CAST(t1.ELIGEND AS DATETIME)) AS ELIGEND
       , ROW_NUMBER() OVER(PARTITION BY s1.CLIENTID ORDER BY CAST(s1.ELIGBEGIN AS DATETIME)) AS Sequence_ID
INTO #CY20
FROM --[dbo].[Eligs] 
 --#t s1
 #y2 s1
INNER JOIN #y2--#t
t1 ON t1.CLIENTID = s1.CLIENTID
                   AND CAST(s1.ELIGBEGIN AS DATETIME) <= CAST(t1.ELIGEND AS DATETIME)
                   AND s1.DEMONSTRATION_POPULATION = 'Adult Expansion'
    AND NOT EXISTS
    (
    SELECT * FROM #y2 t2--#t t2
                 WHERE t2.CLIENTID = t1.CLIENTID
                   AND (CAST(t1.ELIGEND AS DATETIME) + 1) >= CAST(t2.ELIGBEGIN AS DATETIME)
                         AND CAST(t1.ELIGEND AS DATETIME) < CAST(t2.ELIGEND AS DATETIME)
                   )
WHERE --s1.DEMONSTRATION_POPLUATION = 'Adult Expansion' 
    --AND 
    NOT EXISTS
    (
    SELECT * FROM #y2 s2--#t s2
                 WHERE s2.CLIENTID = s1.CLIENTID
                   AND CAST(s1.ELIGBEGIN AS DATETIME)       > CAST(s2.ELIGBEGIN AS DATETIME)
                   AND (CAST(s1.ELIGBEGIN AS DATETIME) - 1) <= CAST(s2.ELIGEND AS DATETIME)
                   )
        --AND s1.DEMONSTRATION_POPLUATION = 'Adult Expansion'
        --AND s1.UMIC = '1'
GROUP BY s1.CLIENTID, CAST(s1.ELIGBEGIN AS DATETIME), s1.AGE
--ORDER BY s1.CLIENTID, CAST(s1.ELIGBEGIN AS DATETIME)
;

SELECT CLIENTID
, AGE
, ELIGBEGIN
, ELIGEND
, Sequence_ID = ROW_NUMBER() OVER (PARTITION BY CLIENTID ORDER BY ELIGBEGIN)
INTO #testcase
FROM (
SELECT * FROM #CY19
UNION
SELECT * FROM #CY20) a

SELECT * FROM #testcase
ORDER BY CLIENTID

但是,最终的实际输出SELECT * FROM #testcase如下:

实际结果:

客户编号 年龄 开始 精英 序列号
123456789 52 2019-12-19 2019-12-31 1
123456789 52 2020-01-01 2020-01-15 2
123456789 52 2020-03-01 2020-12-31 3
020234587 36 2019-06-01 2019-06-30 1
020234587 36 2019-08-01 2019-11-30 2

如您所见,我面临三个主要问题,即实际输出与所需输出不匹配。

  1. 我必须在两个不同的年份运行相同的查询两次,因为我不知道如何为从 2019 年 12 月 31 日到 2020 年 1 月 31 日及以后连续注册的成员细分注册。如果我运行 code table #t而不是 table #y1,第一个 CLIENTID 的输出将是 12/16/19 到 1/15/20,这是我不想要的。
  2. 我完全失去了一个客户 ID,因为他们在“成人扩展”子群体中,然后在“示范群体 #3”中,然后又回到“成人扩展”中
  3. 出于某种原因,代码将正确忽略客户所在的前面的行,DEMONSTRATION_POPULATION != 'Adult Expansion'但如果 CLIENTID 从“成人扩展”更改为不同的子群体,则实际输出仍包括来自该不同子群体的注册日期,这直接针对我想要的输出。当个人注册“成人扩展”时,我只想要注册段。

如果有人能协助解决以上三个问题,我将非常感激!

提前感谢您的任何提示!

标签: sqlsql-servertsqlgaps-and-islands

解决方案


这是使用间隙和岛屿方法的解决方案:

;WITH prevNextCTE
AS
(
    SELECT CLIENTID,
           DEMONSTRATION_POPULATION,
           ELIGBEGIN,
           ELIGEND,
           AGE,
           -- compare the year of the previous record to the current to create split at year end
           CASE WHEN YEAR(LAG(ELIGEND,1) OVER (PARTITION BY CLIENTID ORDER BY ELIGBEGIN)) = YEAR(ELIGBEGIN)
                THEN LAG(ELIGEND,1) OVER (PARTITION BY CLIENTID ORDER BY ELIGBEGIN)
           END AS prevELIGEND,
           -- compare the year of the next record to the current to create split at year end
           CASE WHEN YEAR(LEAD(ELIGBEGIN,1) OVER (PARTITION BY CLIENTID ORDER BY ELIGBEGIN)) = YEAR(ELIGEND)
                THEN LEAD(ELIGBEGIN,1) OVER (PARTITION BY CLIENTID ORDER BY ELIGBEGIN)
           END AS nextELIGBEGIN
    FROM #t
    WHERE DEMONSTRATION_POPULATION = 'Adult Expansion'
)
,islandStartCTE
AS
(
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY CLIENTID ORDER BY ELIGBEGIN) AS sequence_no
    FROM prevNextCTE
    WHERE prevELIGEND IS NULL
    OR DATEADD(DAY,1,prevELIGEND) < ELIGBEGIN
)
,islandEndCTE
AS
(
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY CLIENTID ORDER BY ELIGBEGIN) AS sequence_no
    FROM prevNextCTE
    WHERE nextELIGBEGIN IS NULL
    OR DATEADD(DAY,1,ELIGEND) < nextELIGBEGIN
)
SELECT iss.CLIENTID,
       iss.AGE,
       iss.ELIGBEGIN,
       ise.ELIGEND,
       iss.sequence_no
FROM islandStartCTE AS iss
-- left join here is not necessary but makes it easier to spot errors 
-- when a start position has no matching end (which should not occur)
LEFT
JOIN islandEndCTE AS ise 
ON ise.CLIENTID = iss.CLIENTID
AND ise.sequence_no = iss.sequence_no

推荐阅读