python - 在其他特定事件之前计算 DataFrame 中的特定事件
问题描述
# Example data set: 20 e-commerce events. Row 16 is the single 'cart'
# event (product 5100816, user 550121407); every other row is a 'view'.
_times = [
    '2019-10-01 00:04:47 UTC', '2019-10-01 00:04:49 UTC',
    '2019-10-01 00:04:51 UTC', '2019-10-01 00:04:53 UTC',
    '2019-10-01 00:04:54 UTC', '2019-10-01 00:04:56 UTC',
    '2019-10-01 00:04:58 UTC', '2019-10-01 00:05:01 UTC',
    '2019-10-01 00:05:03 UTC', '2019-10-01 00:05:03 UTC',
    '2019-10-01 00:05:06 UTC', '2019-10-01 00:05:07 UTC',
    '2019-10-01 00:05:10 UTC', '2019-10-01 00:05:10 UTC',
    '2019-10-01 00:05:13 UTC', '2019-10-01 00:05:13 UTC',
    '2019-10-01 00:05:14 UTC', '2019-10-01 00:05:15 UTC',
    '2019-10-01 00:05:16 UTC', '2019-10-01 00:05:17 UTC',
]
_types = ['view'] * 16 + ['cart'] + ['view'] * 3
_products = [
    5100816, 10800048, 15700176, 17300555, 12703015, 2601543,
    1004966, 1005115, 1004284, 4300285, 26204088, 13400134, 19000265,
    1002532, 2700609, 26400672, 5100816, 4300262, 28101002, 26300087,
]
_categories = [
    2053013553375346967, 2053013554994348409, 2053013559733912211,
    2053013553853497655, 2053013553559896355, 2053013563970159485,
    2053013555631882655, 2053013555631882655, 2053013555631882655,
    2053013552385491165, 2053013563693335403, 2053013557066334713,
    2053013557225718275, 2053013555631882655, 2053013563911439225,
    2053013563651392361, 2053013553375346967, 2053013552385491165,
    2053013564918072245, 2053013563584283495,
]
_users = [
    550121407, 539194858, 552373753, 520002471, 555448033, 513173999,
    523208885, 529755884, 513840435, 544648245, 515547943, 552795065,
    525734504, 551377651, 502372361, 555447748, 550121407, 523239174,
    537858238, 555448236,
]
sample = pd.DataFrame({
    "event_time": _times,
    "event_type": _types,
    "product_id": _products,
    "category_id": _categories,
    "user_id": _users,
})
event_time event_type product_id category_id user_id
0 2019-10-01 00:04:47 UTC view 5100816 2053013553375346967 550121407
1 2019-10-01 00:04:49 UTC view 10800048 2053013554994348409 539194858
2 2019-10-01 00:04:51 UTC view 15700176 2053013559733912211 552373753
3 2019-10-01 00:04:53 UTC view 17300555 2053013553853497655 520002471
4 2019-10-01 00:04:54 UTC view 12703015 2053013553559896355 555448033
5 2019-10-01 00:04:56 UTC view 2601543 2053013563970159485 513173999
6 2019-10-01 00:04:58 UTC view 1004966 2053013555631882655 523208885
7 2019-10-01 00:05:01 UTC view 1005115 2053013555631882655 529755884
8 2019-10-01 00:05:03 UTC view 1004284 2053013555631882655 513840435
9 2019-10-01 00:05:03 UTC view 4300285 2053013552385491165 544648245
10 2019-10-01 00:05:06 UTC view 26204088 2053013563693335403 515547943
11 2019-10-01 00:05:07 UTC view 13400134 2053013557066334713 552795065
12 2019-10-01 00:05:10 UTC view 19000265 2053013557225718275 525734504
13 2019-10-01 00:05:10 UTC view 1002532 2053013555631882655 551377651
14 2019-10-01 00:05:13 UTC view 2700609 2053013563911439225 502372361
15 2019-10-01 00:05:13 UTC view 26400672 2053013563651392361 555447748
16 2019-10-01 00:05:14 UTC cart 5100816 2053013553375346967 550121407
17 2019-10-01 00:05:15 UTC view 4300262 2053013552385491165 523239174
18 2019-10-01 00:05:16 UTC view 28101002 2053013564918072245 537858238
19 2019-10-01 00:05:17 UTC view 26300087 2053013563584283495 555448236
我有上面的 DataFrame,总共大约 14GB。我需要做的是计算用户在将产品添加到购物车之前“查看”了多少次。问题是,很多时候用户在将产品添加到购物车甚至购买之后也会“查看”产品。这意味着我不能简单地将它们排除在外。我试过这段代码:
# Asker's original (slow) attempt: for every cart event, re-scan every
# row that precedes it and count matching (product_id, user_id) views.
products = list(data.loc[data["event_type"] == "cart"]["product_id"])
users = list(data.loc[data["event_type"] == "cart"]["user_id"])
idx_carting = list(data.loc[data["event_type"] == "cart"].index)
views_count = []
for i in range(len(idx_carting)):
    # NOTE(review): .iloc slices by *position*, but idx_carting holds index
    # *labels* -- this only works if the frame has a default RangeIndex.
    sub_df = data.iloc[:idx_carting[i]]
    # Each iteration filters a fresh prefix of the whole frame, so total
    # work is O(n_carts * n_rows) -- quadratic, hence the multi-hour run.
    views_count.append(len(sub_df.loc[(sub_df["product_id"] == products[i])
                                      & (sub_df["user_id"]==users[i])]))
但是它太重了,我让它运行了2个多小时,它仍然没有完成任务。有人有更有效的方法吗?
解决方案
您最好先筛选出您实际需要处理的数据,按产品和用户 ID 分组,然后统计数量。以下方法只统计首次加入购物车之前的查看次数;因此,同一商品的后续加购事件根本不会被计入。
我添加了一行额外的数据,用来说明首次加入购物车之后的查看事件不会被计入。
import pandas as pd

# Answer's version of the sample data: the same events as the question
# plus one extra row (index 20) -- a *view* of product 5100816 by user
# 550121407 occurring after its cart event at index 16 -- to show that
# post-cart views are excluded from the count.
# (Fixed: the snippet was truncated and missing the closing "})".)
sample = pd.DataFrame({
    "event_time": ['2019-10-01 00:04:47 UTC', '2019-10-01 00:04:49 UTC',
                   '2019-10-01 00:04:51 UTC', '2019-10-01 00:04:53 UTC',
                   '2019-10-01 00:04:54 UTC', '2019-10-01 00:04:56 UTC',
                   '2019-10-01 00:04:58 UTC', '2019-10-01 00:05:01 UTC',
                   '2019-10-01 00:05:03 UTC', '2019-10-01 00:05:03 UTC',
                   '2019-10-01 00:05:06 UTC', '2019-10-01 00:05:07 UTC',
                   '2019-10-01 00:05:10 UTC', '2019-10-01 00:05:10 UTC',
                   '2019-10-01 00:05:13 UTC', '2019-10-01 00:05:13 UTC',
                   '2019-10-01 00:05:14 UTC', '2019-10-01 00:05:15 UTC',
                   '2019-10-01 00:05:16 UTC', '2019-10-01 00:05:17 UTC',
                   '2019-10-01 00:05:19 UTC'],
    "event_type": ['view', 'view', 'view', 'view', 'view', 'view', 'view',
                   'view', 'view', 'view', 'view', 'view', 'view', 'view',
                   'view', 'view', 'cart', 'view', 'view', 'view', 'view'],
    "product_id": [5100816, 10800048, 15700176, 17300555, 12703015, 2601543,
                   1004966, 1005115, 1004284, 4300285, 26204088, 13400134,
                   19000265, 1002532, 2700609, 26400672, 5100816, 4300262,
                   28101002, 26300087, 5100816],
    "user_id": [550121407, 539194858, 552373753, 520002471, 555448033,
                513173999, 523208885, 529755884, 513840435, 544648245,
                515547943, 552795065, 525734504, 551377651, 502372361,
                555447748, 550121407, 523239174, 537858238, 555448236,
                550121407],
})
下面我们使用两个按相同键(user_id 和 product_id)分组的 groupby 对象。for 循环仍然较慢,但会比您当前的方法快得多。
# get the list of product ids that are actually added to a cart
cart_products = sample.query('event_type=="cart"').product_id
# create groupings for each cart event for a user and product;
# this is used to get the first cart event per (user_id, product_id)
gb_c = sample.query('event_type=="cart"').groupby(['user_id', 'product_id'])
# create a grouping of view events for each user and product where a
# cart event also occurs in the data
gb_v = (
    sample[sample.product_id.isin(cart_products)]
    .query('event_type=="view"')
    .groupby(['user_id', 'product_id'])
)
# iterate over the cart groups, pull out the matching view group,
# and count only the views that happened before the first cart event
counts = []
indices = []
for ix, g1 in gb_c:
    try:
        g2 = gb_v.get_group(ix)
    except KeyError:
        # handle the rare case of a cart event with no view event
        continue
    # event_time is a fixed-width ISO-style string here, so string
    # comparison with .lt() orders the same way as the timestamps would
    counts.append(g2.loc[g2.event_time.lt(g1.event_time.iloc[0]), 'event_type'].count())
    indices.append(ix)
# form a new data frame indexed by (user_id, product_id)
# (Fixed: the closing parenthesis of pd.DataFrame was missing.)
df = pd.DataFrame(
    {'counts': counts},
    index=pd.MultiIndex.from_tuples(indices, names=['user_id', 'product_id']),
)
例如,这是一个有 5000 万行的数据框,在我的笔记本电脑上不到 2 分钟就可以完成。
# Benchmark: the same two-groupby approach on a synthetic frame with
# 50 million rows (reported to finish in under 2 minutes on a laptop).
import pandas as pd
import numpy as np
s = 50_000_000
df = pd.DataFrame({
    # ~0.1% of the rows are 'cart' events; the rest are 'view's.
    'event_type': np.random.choice(['view', 'cart'], size=s, p=(0.999, 0.001)),
    'product_id': np.random.randint(100000, 200000, size=s),
    'user_id': np.random.randint(100000, 150000, size=s)
    },
    # NOTE(review): pandas' millisecond alias is 'ms'; 'mS' may raise
    # ValueError on newer pandas versions -- confirm on the installed version.
    index=pd.date_range('2020-01-01', periods=s, freq='mS', name='event_time')).reset_index()
# product ids that appear in at least one cart event
cart_products = df.query('event_type=="cart"').product_id
# cart events grouped per (user_id, product_id)
gb_c = df.query('event_type=="cart"').groupby(['user_id', 'product_id'])
# view events for those same products, grouped by the same keys
gb_v = (
    df[df.product_id.isin(cart_products)]
    .query('event_type=="view"')
    .groupby(['user_id', 'product_id'])
)
counts = []
indices = []
for ix, g1 in gb_c:
    try:
        # matching view group for this (user_id, product_id) pair
        g2 = gb_v.get_group(ix)
    except KeyError:
        # cart event with no corresponding view event: skip it
        continue
    # count only views strictly before the first cart timestamp
    counts.append(g2.loc[g2.event_time.lt(g1.event_time.iloc[0]), 'event_type'].count() )
    indices.append(ix)
# result: one row per (user_id, product_id) with its pre-cart view count
pd.DataFrame({'counts': counts},
             index=pd.MultiIndex.from_tuples(indices, names=['user_id', 'product_id']))
推荐阅读
- r - 如何进行 10-最近邻倾向得分匹配?我似乎只在网上找到了 1-1 个最近邻匹配
- vue.js - 从非 vue 文件访问 vue 3 应用实例 api
- python-3.x - Excel - 工作表到 Parquet
- .net - 在远程机器上卸载 .net core sdk 1.x
- javascript - 在反应路线之间共享数据
- arrays - 使用指针跳过 C 中的空格
- r - 使用 igraph 在 R 中的网络图上不显示特征
- wordpress - Azure ACI Wordpress 模板建立数据库连接时出错
- android - Flutter 在调试模式下比发布时更快
- pandas - Pandas CSV 文件可视化 | 推文发帖频率