首页 > 解决方案 > 使用日期范围重构 pandas DataFrame

问题描述

我需要根据日期重叠重构 pandas DataFrame。为了更多地解释我的问题,让我向您展示一些示例:

sample_df = pd.DataFrame({'id':['AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC', 'DDD', 'DDD'],
                          's':[date(2000, 1, 12), date(2005, 2, 25), date(2005, 8, 30), date(2009, 4, 22), date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1), date(1997, 1, 1), date(1998, 1, 1)],
                          'e':[date(2000, 6, 22), date(2005, 8, 30), date(2006, 1, 1), date(2015, 4, 22), date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1), date(1997, 3, 1), date(1998, 3, 1)]})

id      s           e
AAA     2000-01-12  2000-06-22
AAA     2005-02-25  2005-08-30
AAA     2005-08-30  2006-01-01
AAA     2009-04-22  2015-04-22
AAA     2015-04-22  2015-05-07
AAA     2015-05-07  2018-09-28
BBB     1972-11-04  2019-08-01
BBB     2019-06-15  2019-12-31
CCC     2000-11-04  2005-11-04
CCC     1995-01-01  2019-01-01
DDD     1997-01-01  1997-03-01
DDD     1998-01-01  1998-03-01

我想获得一个 DataFrame,例如:

id      s           e
AAA     2000-01-12  2000-06-22 # row unchanged: no overlap with any other row with id=='AAA'
AAA     2005-02-25  2006-01-01 # 2 rows merged
AAA     2015-04-22  2018-09-28 # 3 rows merged
BBB     1972-11-04  2019-12-31
CCC     1995-01-01  2019-01-01
DDD     1997-01-01  1997-03-01
DDD     1998-01-01  1998-03-01 # row kept as data range does not intersect with the previous one

关于如何做到这一点的任何想法?希望你能帮忙。

问候

编辑 如果给定的日期范围id不重叠,则不应合并行!

编辑 2为数据 添加更复杂DataFrame的几个案例id=='AAA'

标签: pythonpandasdatetime

解决方案


编辑2:(按时间倒序发布以避免浪费阅读时间和不必要的滚动)

努力避免 OP 的 EDIT 2 出现循环。暂时发布此内容。如果存在,很想知道更好的解决方案

from numpy import nan
from pandas import DataFrame as df
from datetime import date,timedelta


sample_df = df({'id':['AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC', 'DDD', 'DDD'],'s':[date(2000, 1, 12), date(2005, 2, 25), date(2005, 8, 30), date(2009, 4, 22), date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1), date(1997, 1, 1), date(1998, 1, 1)],'e':[date(2000, 6, 22), date(2005, 8, 30), date(2006, 1, 1), date(2015, 4, 22), date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1), date(1997, 3, 1), date(1998, 3, 1)]})

def grouped_flag(row_in):
    labels = len(row_in)
    row_in['label'] = nan
    for ind,row in row_in.iterrows():
        if not row['overlap']:
            row_in.loc[row.name,'label'] = labels
            labels = labels - 1
        else:
            row_in.loc[row.name,'label'] = labels

    row_in['label'] = row_in['label'].astype(str)
    result = row_in.groupby(['id','label']).agg({'s':min,'e':max}).reset_index().drop(['label'],axis=1)


    return result


sample_df['overlap'] = (sample_df.groupby('id').apply(lambda x: (x['e'] - x['s'].shift(-1)) >= timedelta(0)).reset_index(level=0, drop=True))
out_df = sample_df.groupby(['id']).apply(grouped_flag).reset_index(drop=True)
print(out_df)

如果您可以不做一步,我认为这是可行的。

编辑 1:对代码的清洁表示歉意。写这个只是为了解决方案。复制时,在开始和结束日期分别执行每个 lambda 函数,或者更好的是,只需编写一个单独的函数并在 apply 中调用它。那绝对是整洁的。

>>> from datetime import date
>>> from pandas import DataFrame as df, concat
>>> tbl = df({'id':['AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC'],
...                           's':[date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1)],
...                           'e':[date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1)]})

## using a lambda function on the start and end date to get the min start date and max end date for each unique value in the ID column, and using concat so I can assign it to both columns simultaneously. You can do them separately in the case that it's part of a script for production/high performance.
>>> tbl[['s_mod','e_mod']] = concat([tbl['id'].apply(lambda x: tbl.loc[tbl['id']==x,'s'].min()),tbl['id'].apply(lambda x: tbl.loc[tbl['id']==x,'e'].max())],axis=1,keys=['s_mod','e_mod'])

## Since each value of id has only one unique s_mod and e_mod, subsetting the df and dropping duplicates to get the desired result
>>> tbl = tbl[['id','s_mod','e_mod']].drop_duplicates()
>>> tbl
    id       s_mod       e_mod
0  AAA  2015-04-22  2018-09-28
2  BBB  1972-11-04  2019-12-31
4  CCC  1995-01-01  2019-01-01
>>> 

EDIT 1: To account for non-overlapping dates the solution below seems to work. 

from pandas import DataFrame as df, concat
from datetime import date
from numpy import nan

sample_df = df({'id':['AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC', 'DDD', 'DDD'],
                          's':[date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1), date(1997, 1, 1), date(1998, 1, 1)],
                          'e':[date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1), date(1997, 3, 1), date(1998, 3, 1)]})

sample_df['s_mod'] = sample_df['id'].apply(
          lambda x: sample_df['s'].loc[sample_df['id']==x] 
           if len(sample_df.loc[sample_df['id']==x,]) == 1 # IN THE CASE THAT THERE IS ONLY ONE RECORD OF 'id' CONTAINING 'x'
           else sample_df.loc[sample_df['id']==x,]['s'].min() 
           if sample_df.loc[sample_df['id']==x,]['e'].min() # MIN END DATE OF x 
>= sample_df.loc[sample_df['id']==x,]['s'].max() # <-- MAX START DATE of x | IF OVERLAPPING ASSIGN THE MIN VALUE OF 's'
           else nan) # ELSE ASSIGN A nan
# SAME THING APPLIED FOR END DATE
sample_df['e_mod'] = sample_df['id'].apply(lambda x: sample_df['e'].loc[sample_df['id']==x] if len(sample_df.loc[sample_df['id']==x,]) == 1 else sample_df.loc[sample_df['id']==x,]['e'].max() if sample_df.loc[sample_df['id']==x,]['e'].min() >= sample_df.loc[sample_df['id']==x,]['s'].max() else nan)

sample_df
   index   id           s           e       s_mod       e_mod
0      2  BBB  1972-11-04  2019-08-01  1972-11-04  2019-12-31
1      5  CCC  1995-01-01  2019-01-01  1995-01-01  2019-01-01
2      6  DDD  1997-01-01  1997-03-01         NaN         NaN
3      7  DDD  1998-01-01  1998-03-01         NaN         NaN
4      4  CCC  2000-11-04  2005-11-04  1995-01-01  2019-01-01
5      0  AAA  2015-04-22  2015-05-07  2015-04-22  2018-09-28
6      1  AAA  2015-05-07  2018-09-28  2015-04-22  2018-09-28
7      3  BBB  2019-06-15  2019-12-31  1972-11-04  2019-12-31

# FILLING THE nan WITH THE ORIGIN VALUES SO THAT WE MAY RETAIN THE ORIGINAL RECORDS
sample_df['s_mod'] = sample_df['s_mod'].fillna(sample_df['s'])
sample_df['e_mod'] = sample_df['e_mod'].fillna(sample_df['e'])
sample_df = sample_df[['id','s_mod','e_mod']].drop_duplicates()
sample_df
    id       s_mod       e_mod
0  BBB  1972-11-04  2019-12-31
1  CCC  1995-01-01  2019-01-01
2  DDD  1997-01-01  1997-03-01
3  DDD  1998-01-01  1998-03-01
5  AAA  2015-04-22  2018-09-28


推荐阅读