python - Pandas:第一次满足条件后如何删除组的所有后续行?
问题描述
在 `date` 列的值第一次变为 4 之后,我试图删除数据框中每个组的后续行。
# Sample frame (assumes `import pandas as pd`): 26 rows that fall into the
# groups A-D via the `variable` column. `date` is the column the filtering
# condition is tested against; `value` contains two missing entries (None).
df = pd.DataFrame({"date": [1,2,3,3,4,1,2,3,3,4,1,1,1,4,4,4,1,1,1,2,2,3,3,3,4,4],
"variable": ["A", "A", "A","A","A","A", "A", "A","A","A", "B", "B", "B","B","B","B" ,"C", "C", "C","C", "D","D","D","D","D","D"],
"no": [1, 2.2, 3.5, 1.5, 1.5,1, 2.2, 3.5, 1.5, 1.5, 1.2, 1.3, 1.1, 2, 3,1, 2.2, 3.5, 1.5, 1.5, 1.2, 1.3, 1.1, 2, 3,9],
"value": [0.469112, -0.282863, -1.509059, -1.135632, 1.212112,0.469112, -0.282863, -1.509059, -1.135632, 1.212112, -0.173215,
0.119209, -1.044236, -0.861849, None,0.469112, -0.282863, -1.509059, -1.135632, 1.212112, -0.173215,
0.119209, -1.044236, -0.861849, None,0.87]})
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
4 4 A 1.5 1.212112
5 1 A 1.0 0.469112
6 2 A 2.2 -0.282863
7 3 A 3.5 -1.509059
8 3 A 1.5 -1.135632
9 4 A 1.5 1.212112
10 1 B 1.2 -0.173215
11 1 B 1.3 0.119209
12 1 B 1.1 -1.044236
13 4 B 2.0 -0.861849
14 4 B 3.0 NaN
15 4 B 1.0 0.469112
16 1 C 2.2 -0.282863
17 1 C 3.5 -1.509059
18 1 C 1.5 -1.135632
19 2 C 1.5 1.212112
20 2 D 1.2 -0.173215
21 3 D 1.3 0.119209
22 3 D 1.1 -1.044236
23 3 D 2.0 -0.861849
24 4 D 3.0 NaN
25 4 D 9.0 0.870000
到目前为止,我已经尝试过这些方法:
def tail_test(group):
    """Drop every row whose ``date`` is 4 and the single row right after it.

    Args:
        group: DataFrame for one group; must have a ``date`` column.

    Returns:
        The rows of ``group`` that are neither a 4 nor directly preceded by
        a 4, with their original index labels.
    """
    # shift() moves each date down by one row, so the second test flags only
    # the row immediately following a 4 -- later rows in the group are kept.
    group = group[~(group.date.eq(4) | group.date.shift().eq(4))]
    return group
# Run tail_test on each `variable` group, then renumber the combined rows from 0.
df_sub = df.groupby('variable').apply(tail_test).reset_index(drop=True)
输出:
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
4 2 A 2.2 -0.282863
5 3 A 3.5 -1.509059
6 3 A 1.5 -1.135632
7 1 B 1.2 -0.173215
8 1 B 1.3 0.119209
9 1 B 1.1 -1.044236
10 1 C 2.2 -0.282863
11 1 C 3.5 -1.509059
12 1 C 1.5 -1.135632
13 2 C 1.5 1.212112
14 2 D 1.2 -0.173215
15 3 D 1.3 0.119209
16 3 D 1.1 -1.044236
17 3 D 2.0 -0.861849
基本上,它会删除所有的 4,但并没有删除其后的所有行。
我尝试的下一个方法是:
def f(df):
    """Keep rows while the running count of ``date == 4`` is at most one.

    Args:
        df: DataFrame for one group; must have a ``date`` column.

    Returns:
        The leading rows of ``df`` up to, but not including, the second row
        whose ``date`` equals 4. The first 4 itself is retained.
    """
    # cumsum() counts how many 4s have been seen so far; `<= 1` therefore
    # stays True through the first 4 and until the second 4 appears.
    mask = (df.date == 4).cumsum() <= 1
    return df[mask]
# Run f on each `variable` group and combine the filtered groups.
df_sub = df.groupby("variable").apply(f)
输出是:
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
4 4 A 1.5 1.212112
5 1 A 1.0 0.469112
6 2 A 2.2 -0.282863
7 3 A 3.5 -1.509059
8 3 A 1.5 -1.135632
9 1 B 1.2 -0.173215
10 1 B 1.3 0.119209
11 1 B 1.1 -1.044236
12 4 B 2.0 -0.861849
13 1 C 2.2 -0.282863
14 1 C 3.5 -1.509059
15 1 C 1.5 -1.135632
16 2 C 1.5 1.212112
17 2 D 1.2 -0.173215
18 3 D 1.3 0.119209
19 3 D 1.1 -1.044236
20 3 D 2.0 -0.861849
21 4 D 3.0 NaN
我可能犯了一些我无法弄清楚的愚蠢错误。请帮忙!
解决方案
如果我理解正确(IIUC),可以使用 `groupby` 配合 `cumprod` 来检测从哪一行开始过滤:
# Per `variable` group: s.ne(4) is True while date is not 4; cumprod() turns
# to 0 at the first 4 and stays 0 afterwards, so the boolean mask keeps only
# the rows that come before the first 4 in each group.
df[df.groupby('variable').date.transform(lambda s: s.ne(4).cumprod().astype(bool))]
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
10 1 B 1.2 -0.173215
11 1 B 1.3 0.119209
12 1 B 1.1 -1.044236
16 1 C 2.2 -0.282863
17 1 C 3.5 -1.509059
18 1 C 1.5 -1.135632
19 2 C 1.5 1.212112
20 2 D 1.2 -0.173215
21 3 D 1.3 0.119209
22 3 D 1.1 -1.044236
23 3 D 2.0 -0.861849
推荐阅读
- dart - Flutter 放大
- spring-batch - ETL Spring批处理、Spring云数据流(SCDF)
- flutter - Android 依赖 'com.android.support:support-v4' 的编译(27.0.1)和运行时(27.1.1)类路径有不同的版本
- c++ - 如何从字符串创建固定的 QDateTime?
- java - 如何映射列表中的 JSON 单个项目
在一些传入的 JSON 中有一个列表
"age" : 27, "country", USA, "fields": [ { "id": 261762251, "value": "Fred" }, { "id": 2615161
- javascript - ASP.net MVC bundle minification 重复变量名冲突
- python - 如何从元组列表中删除项目
- spring - Spring security:仅将过滤器应用于端点
- tabulator - 如何在 Tabulator 中取消编辑
- docker - 无法从容器访问远程 oracle 数据库