python - 使用日期范围重构 pandas DataFrame
问题描述
我需要根据日期重叠重构 pandas DataFrame。为了更多地解释我的问题,让我向您展示一些示例:
sample_df = pd.DataFrame({'id':['AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC', 'DDD', 'DDD'],
's':[date(2000, 1, 12), date(2005, 2, 25), date(2005, 8, 30), date(2009, 4, 22), date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1), date(1997, 1, 1), date(1998, 1, 1)],
'e':[date(2000, 6, 22), date(2005, 8, 30), date(2006, 1, 1), date(2015, 4, 22), date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1), date(1997, 3, 1), date(1998, 3, 1)]})
id s e
AAA 2000-01-12 2000-06-22
AAA 2005-02-25 2005-08-30
AAA 2005-08-30 2006-01-01
AAA 2009-04-22 2015-04-22
AAA 2015-04-22 2015-05-07
AAA 2015-05-07 2018-09-28
BBB 1972-11-04 2019-08-01
BBB 2019-06-15 2019-12-31
CCC 2000-11-04 2005-11-04
CCC 1995-01-01 2019-01-01
DDD 1997-01-01 1997-03-01
DDD 1998-01-01 1998-03-01
我想获得一个 DataFrame,例如:
id s e
AAA 2000-01-12 2000-06-22 # row unchanged: no overlap with any other row with id=='AAA'
AAA 2005-02-25 2006-01-01 # 2 rows merged
AAA 2015-04-22 2018-09-28 # 3 rows merged
BBB 1972-11-04 2019-12-31
CCC 1995-01-01 2019-01-01
DDD 1997-01-01 1997-03-01
DDD 1998-01-01 1998-03-01 # row kept as data range does not intersect with the previous one
关于如何做到这一点的任何想法?希望你能帮忙。
问候
编辑
如果给定的日期范围id
不重叠,则不应合并行!
编辑 2为数据
添加更复杂DataFrame
的几个案例id=='AAA'
:
- 6 行
id=='AAA'
- 1 行必须保持原样,因为没有重叠
- 由于时间段重叠,必须合并 2 个组
- 1组两排
- 1组三排
解决方案
编辑2:(按时间倒序发布以避免浪费阅读时间和不必要的滚动)
努力避免 OP 的 EDIT 2 出现循环。暂时发布此内容。如果存在,很想知道更好的解决方案
from numpy import nan
from pandas import DataFrame as df
from datetime import date,timedelta
sample_df = df({'id':['AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC', 'DDD', 'DDD'],'s':[date(2000, 1, 12), date(2005, 2, 25), date(2005, 8, 30), date(2009, 4, 22), date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1), date(1997, 1, 1), date(1998, 1, 1)],'e':[date(2000, 6, 22), date(2005, 8, 30), date(2006, 1, 1), date(2015, 4, 22), date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1), date(1997, 3, 1), date(1998, 3, 1)]})
def grouped_flag(row_in):
labels = len(row_in)
row_in['label'] = nan
for ind,row in row_in.iterrows():
if not row['overlap']:
row_in.loc[row.name,'label'] = labels
labels = labels - 1
else:
row_in.loc[row.name,'label'] = labels
row_in['label'] = row_in['label'].astype(str)
result = row_in.groupby(['id','label']).agg({'s':min,'e':max}).reset_index().drop(['label'],axis=1)
return result
sample_df['overlap'] = (sample_df.groupby('id').apply(lambda x: (x['e'] - x['s'].shift(-1)) >= timedelta(0)).reset_index(level=0, drop=True))
out_df = sample_df.groupby(['id']).apply(grouped_flag).reset_index(drop=True)
print(out_df)
如果您可以不做一步,我认为这是可行的。
编辑 1:对代码的清洁表示歉意。写这个只是为了解决方案。复制时,在开始和结束日期分别执行每个 lambda 函数,或者更好的是,只需编写一个单独的函数并在 apply 中调用它。那绝对是整洁的。
>>> from datetime import date
>>> from pandas import DataFrame as df, concat
>>> tbl = df({'id':['AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC'],
... 's':[date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1)],
... 'e':[date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1)]})
## using a lambda function on the start and end date to get the min start date and max end date for each unique value in the ID column, and using concat so I can assign it to both columns simultaneously. You can do them separately in the case that it's part of a script for production/high performance.
>>> tbl[['s_mod','e_mod']] = concat([tbl['id'].apply(lambda x: tbl.loc[tbl['id']==x,'s'].min()),tbl['id'].apply(lambda x: tbl.loc[tbl['id']==x,'e'].max())],axis=1,keys=['s_mod','e_mod'])
## Since each value of id has only one unique s_mod and e_mod, subsetting the df and dropping duplicates to get the desired result
>>> tbl = tbl[['id','s_mod','e_mod']].drop_duplicates()
>>> tbl
id s_mod e_mod
0 AAA 2015-04-22 2018-09-28
2 BBB 1972-11-04 2019-12-31
4 CCC 1995-01-01 2019-01-01
>>>
EDIT 1: To account for non-overlapping dates the solution below seems to work.
from pandas import DataFrame as df, concat
from datetime import date
from numpy import nan
sample_df = df({'id':['AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC', 'DDD', 'DDD'],
's':[date(2015, 4, 22), date(2015, 5, 7), date(1972, 11, 4), date(2019, 6, 15), date(2000, 11, 4), date(1995, 1, 1), date(1997, 1, 1), date(1998, 1, 1)],
'e':[date(2015, 5, 7), date(2018, 9, 28), date(2019, 8, 1), date(2019, 12, 31), date(2005, 11, 4), date(2019, 1, 1), date(1997, 3, 1), date(1998, 3, 1)]})
sample_df['s_mod'] = sample_df['id'].apply(
lambda x: sample_df['s'].loc[sample_df['id']==x]
if len(sample_df.loc[sample_df['id']==x,]) == 1 # IN THE CASE THAT THERE IS ONLY ONE RECORD OF 'id' CONTAINING 'x'
else sample_df.loc[sample_df['id']==x,]['s'].min()
if sample_df.loc[sample_df['id']==x,]['e'].min() # MIN END DATE OF x
>= sample_df.loc[sample_df['id']==x,]['s'].max() # <-- MAX START DATE of x | IF OVERLAPPING ASSIGN THE MIN VALUE OF 's'
else nan) # ELSE ASSIGN A nan
# SAME THING APPLIED FOR END DATE
sample_df['e_mod'] = sample_df['id'].apply(lambda x: sample_df['e'].loc[sample_df['id']==x] if len(sample_df.loc[sample_df['id']==x,]) == 1 else sample_df.loc[sample_df['id']==x,]['e'].max() if sample_df.loc[sample_df['id']==x,]['e'].min() >= sample_df.loc[sample_df['id']==x,]['s'].max() else nan)
sample_df
index id s e s_mod e_mod
0 2 BBB 1972-11-04 2019-08-01 1972-11-04 2019-12-31
1 5 CCC 1995-01-01 2019-01-01 1995-01-01 2019-01-01
2 6 DDD 1997-01-01 1997-03-01 NaN NaN
3 7 DDD 1998-01-01 1998-03-01 NaN NaN
4 4 CCC 2000-11-04 2005-11-04 1995-01-01 2019-01-01
5 0 AAA 2015-04-22 2015-05-07 2015-04-22 2018-09-28
6 1 AAA 2015-05-07 2018-09-28 2015-04-22 2018-09-28
7 3 BBB 2019-06-15 2019-12-31 1972-11-04 2019-12-31
# FILLING THE nan WITH THE ORIGIN VALUES SO THAT WE MAY RETAIN THE ORIGINAL RECORDS
sample_df['s_mod'] = sample_df['s_mod'].fillna(sample_df['s'])
sample_df['e_mod'] = sample_df['e_mod'].fillna(sample_df['e'])
sample_df = sample_df[['id','s_mod','e_mod']].drop_duplicates()
sample_df
id s_mod e_mod
0 BBB 1972-11-04 2019-12-31
1 CCC 1995-01-01 2019-01-01
2 DDD 1997-01-01 1997-03-01
3 DDD 1998-01-01 1998-03-01
5 AAA 2015-04-22 2018-09-28
推荐阅读
- c# - 列出方法或属性的键盘快捷键
- javascript - 使用 gulp 时出现 AssertionError。'必须指定任务功能'
- matlab - `eye(2)` 和 `[1, 0; 0, 1]` 在自动广播中
- android - 使用 ADB 创建新联系人
- python - 生成随机行并在观察之间保持顺序以在 python 中创建数据框
- java - Servlet 注释到 xml
- android - 无法访问类 CoroutineScope。检查模块类路径
- python - 如何在 Django 中创建一个分为两部分的注册表单,以便一个呼叫仅在电子邮件验证后填写第二部分?
- tabulator - 如何防止分组空字段
- typescript - 函数解析对象的 Promise 值的正确类型:{[key: string}: Promise
=> {[键:字符串]:T}