python - 使用多个 if-else 子句对 pandas 数据帧进行矢量化以拆分域
问题描述
请帮我把下面这段 pandas DataFrame 代码向量化或加速,它现在运行得非常慢。
我有下面这段代码,它的结果完全符合我的要求:它接收带有大量子域的域名,并将其规范化为只保留主机名 + TLD 的形式。
我找不到任何对 if-else 语句进行向量化的示例。
import time

import pandas as pd

# Wall-clock timer covering the whole pipeline, CSV load included.
start = time.time()

# Import the file into a dataframe.
path = "Desktop/dom1.csv"
df = pd.read_csv(path, delimiter=',', header='infer', encoding="ISO-8859-1")

# Strip out all rows whose domain is the '----' placeholder.
df2 = df[df['domain'] != '----']

# Extract only the two columns that are needed. The explicit copy makes df3 an
# independent frame, so adding a column below does not write into a slice of df2.
df3 = df2[['domain', 'web.optimisedsize']].copy()

# Define the TLD and CDN lookup lists.
# 'in' and 'es' are now separate entries: without the comma Python joins the
# adjacent literals into the single string 'ines'.
tld = [
    'co.uk', 'com', 'org', 'gov.uk', 'co', 'net', 'news', 'it', 'in', 'es', 'tw', 'pe', 'io', 'ca', 'cat', 'com.au',
    'com.ar', 'com.mt', 'com.co', 'ws', 'to', 'es', 'de', 'us', 'br', 'im', 'gr', 'cc', 'cn', 'org.uk', 'me', 'ovh', 'be',
    'tv', 'tech', '..', 'life', 'com.mx', 'pl', 'uk', 'ru', 'cz', 'st', 'info', 'mobi', 'today', 'eu', 'fi', 'jp', 'life',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'earth', 'ninja', 'ie', 'im', 'ai', 'at', 'ch', 'ly', 'market', 'click',
    'fr', 'nl', 'se',
]
cdns = ['akamai', 'maxcdn', 'cloudflare']

# Pre-initialise so the column assignment below still works on an empty frame.
cleandomain = []

# Iterate through each row of the dataframe and split each domain at the dot.
# NOTE(review): the split is loop-invariant and `row` is never read, so the
# outer loop only repeats identical work once per row. It is kept as posted
# because this slowness is exactly what the question asks about.
for row in df2.itertuples():
    index = df3.domain.str.split('.').tolist()
    cleandomain = []
    # Iterate through each of the split domains.
    for x in index:
        # If it isn't a string, put the value directly in the cleandomain list.
        # NOTE(review): every element of `index` is a list (or NaN for a
        # missing value), never a str, so this branch is taken for every
        # element and the branches below are not reached.
        if not isinstance(x, str):
            cleandomain.append(str(x))
        # If it ends in a digit, treat it as an IP address.
        elif str(x)[-1].isnumeric():
            try:
                cleandomain.append(str(x[0]) + '.' + str(x[1]) + '.*.*')
            except IndexError:
                cleandomain.append(str(x))
        # If it is in the CDN list, take a subdomain as well.
        elif len(x) > 3 and str(x[-2]).rstrip() in cdns:
            try:
                cleandomain.append(str(x[-3]) + '.' + str(x[-2]) + '.' + str(x[-1]))
            except IndexError:
                cleandomain.append(str(x))
        elif len(x) > 3 and str(x[-3]).rstrip() in cdns:
            try:
                cleandomain.append(str(x[-4]) + '.' + str(x[-3]) + '.' + str(x[-2]) + '.' + str(x[-1]))
            except IndexError:
                cleandomain.append(str(x))
        # If the last two labels form a known two-part TLD, keep three labels.
        elif len(x) > 2 and str(x[-2]).rstrip() + '.' + str(x[-1]).rstrip() in tld:
            try:
                cleandomain.append(str(x[-3]) + '.' + str(x[-2]) + '.' + str(x[-1]))
            except IndexError:
                cleandomain.append(str(x))
        # If the last label is a known TLD, keep two labels.
        elif len(x) > 2 and str(x[-1]) in tld:
            try:
                cleandomain.append(str(x[-2]) + '.' + str(x[-1]))
            except IndexError:
                cleandomain.append(str(x))
        # If it is not in the TLD list, keep the value unchanged.
        else:
            cleandomain.append(str(x))

# Add the cleaned domains as a new column (a single assignment is enough).
df3['newdomain2'] = cleandomain

# Select only the new domain column and the usage column.
df4 = df3[['newdomain2', 'web.optimisedsize']]

# Group by cleaned domain and total the usage.
df5 = df4.groupby(['newdomain2'])[['web.optimisedsize']].sum()

# Sort ascending; `ascending` must be a bool, not the string "true".
df6 = df5.sort_values(['web.optimisedsize'], ascending=True)

end = time.time()
print(df6)
print(end-start)
我的输入是这个 DF:
In [4]: df
Out[4]:
Domain Use
0 graph.facebook.com 4242
1 news.bbc.co.uk 23423
2 news.more.news.bbc.co.uk 234432
3 profile.username.co 235523
4 offers.o2.co.uk 235523
5 subdomain.pyspark.org 2325
6 uds.data.domain.net 23523
7 domain.akamai.net 23532
8 333.333.333.333 3432324
在处理过程中,变量 index 会把每个域名拆分成如下列表:
[['graph', 'facebook', 'com'], ['news', 'bbc' .....
然后,我把新域名作为新列追加到原始数据框中,再按该列分组并求和,得到最终的数据框。
In [10]: df
Out[10]:
Domain Use newdomain
0 graph.facebook.com 4242 facebook.com
1 news.bbc.co.uk 23423 bbc.co.uk
2 news.more.news.bbc.co.uk 234432 bbc.co.uk
3 profile.username.co 235523 username.co
解决方案
其中一个问题是,每次迭代都会重新执行一遍 index = df3.domain.str.split('.').tolist()。
当我把这一行移到循环之外后,计算速度提高了约 2 倍:587 毫秒对比 1.1 秒。
我还认为你的代码本身有错误。你并没有使用 row 变量,而是使用了 index。
遍历 index 时,每个元素始终是一个列表,因此 if not isinstance(x, str) 这个条件永远为 True。
(你可以在下面的 line_profiler 输出中看到这一点)
字符串操作通常无法向量化。即便是 .str 访问器,其内部实际上也是一个 Python 循环。
下面是 Jupyter notebook 中 line_profiler 工具的输出。初始化(f 是把上述代码包装起来的函数):
%load_ext line_profiler
%lprun -f f f(df2, df3)
输出:
Total time: 1.82219 s
File: <ipython-input-8-79f01a353d31>
Function: f at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def f(df2,df3):
2 1 8093.0 8093.0 0.2 index = df3.Domain.str.split('.').tolist()
3 #iterate through each row of the datafrme and split each domain at the dot
4 901 11775.0 13.1 0.2 for row in df2.itertuples():
5
6 900 26241.0 29.2 0.5 cleandomain = []
7 #iterate through each of the split domains
8 810900 971082.0 1.2 18.8 for x in index:
9 #if it isn't a string, then print the value directly in the cleandomain list
10 810000 1331253.0 1.6 25.8 if not isinstance(x, str):
11 810000 2819163.0 3.5 54.6 cleandomain.append(str(x))
12 #if it's a string that encapsulates numbers, then it's an IP
13 elif str(x)[-1].isnumeric():
14 try:
15 cleandomain.append(str(x[0])+'.'+str(x[1])+'.*.*')
16 except IndexError:
17 cleandomain.append(str(x))
18 #if its in the CDN list, take a subdomain as well
19 elif len(x) > 3 and str(x[len(x)-2]).rstrip() in cdns:
20 try:
21 cleandomain.append(str(x[len(x)-3])+'.'+str(x[len(x)-2])+'.'+str(x[len(x)-1]))
22 except IndexError:
23 cleandomain.append(str(x))
24 elif len(x) > 3 and str(x[len(x)-3]).rstrip() in cdns:
25 try:
26 cleandomain.append(str(x[len(x)-4])+'.'+str(x[len(x)-3])+'.'+str(x[len(x)-2])+'.'+ str(x[len(x)-1]))
27 except IndexError:
28 cleandomain.append(str(x))
29 #if its in the TLD list, do this
30 elif len(x) > 2 and str(x[len(x)-2]).rstrip()+'.'+ str(x[len(x)-1]).rstrip() in tld:
31 try:
32 cleandomain.append(str(x[len(x)-3])+'.'+str(x[len(x)-2])+'.'+ str(x[len(x)-1]))
33 except IndexError:
34 cleandomain.append(str(x))
35 elif len(x) > 2 and str(x[len(x)-1]) in tld:
36 try:
37 cleandomain.append(str(x[len(x)-2])+'.'+ str(x[len(x)-1]))
38 except IndexError:
39 cleandomain.append(str(x))
40 #if its not in the TLD list, do this
41 else:
42 cleandomain.append(str(x))
我的代码:
数据准备:
from io import StringIO

import pandas as pd

# Build a small in-memory CSV instead of reading a file from disk.
TESTDATA=StringIO("""Domain,Use
graph.facebook.com, 4242
news.bbc.co.uk, 23423
news.more.news.bbc.co.uk, 234432
profile.username.co, 235523
offers.o2.co.uk, 235523
subdomain.pyspark.org, 2325
uds.data.domain.net, 23523
domain.akamai.net, 23532
333.333.333.333,3432324
""")

# Most "Use" values are written with a space after the comma;
# skipinitialspace makes the parser drop it explicitly.
df = pd.read_csv(TESTDATA, skipinitialspace=True)
df["Domain"] = df.Domain.str.strip()

# Repeat the 9 sample rows 100 times (900 rows) so the timing is measurable.
df = pd.concat([df]*100)

# df2 and df3 are plain aliases of df here; the names are kept so the loop
# under test can be used unchanged.
df2 = df
df3 = df2

# Define the TLD and CDN lookup lists.
# 'in' and 'es' are now separate entries: without the comma Python joins the
# adjacent literals into the single string 'ines'.
tld = [
    'co.uk', 'com', 'org', 'gov.uk', 'co', 'net', 'news', 'it', 'in', 'es', 'tw', 'pe', 'io', 'ca', 'cat', 'com.au',
    'com.ar', 'com.mt', 'com.co', 'ws', 'to', 'es', 'de', 'us', 'br', 'im', 'gr', 'cc', 'cn', 'org.uk', 'me', 'ovh', 'be',
    'tv', 'tech', '..', 'life', 'com.mx', 'pl', 'uk', 'ru', 'cz', 'st', 'info', 'mobi', 'today', 'eu', 'fi', 'jp', 'life',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'earth', 'ninja', 'ie', 'im', 'ai', 'at', 'ch', 'ly', 'market', 'click',
    'fr', 'nl', 'se',
]
cdns = ['akamai', 'maxcdn', 'cloudflare']
在 Jupyter notebook 中计时:
%%timeit
# Split every domain at the dots once, before the row loop starts.
index = df3.Domain.str.split('.').tolist()

# One pass of the outer loop per dataframe row; `row` itself is never read.
for row in df2.itertuples():
    # The result list is rebuilt from scratch on every pass.
    cleandomain = []
    # Walk over each of the split domains.
    for x in index:
        # Anything that is not a string goes into the list as its str() form.
        # The elements of `index` come from .str.split, so they are lists and
        # this is the branch that runs for them.
        if not isinstance(x, str):
            cleandomain.append(str(x))
        # A value ending in a digit is treated as an IP address.
        elif str(x)[-1].isnumeric():
            try:
                cleandomain.append(f"{str(x[0])}.{str(x[1])}.*.*")
            except IndexError:
                cleandomain.append(str(x))
        # CDN name in the second-to-last position: keep three labels.
        elif len(x) > 3 and str(x[-2]).rstrip() in cdns:
            try:
                cleandomain.append(f"{str(x[-3])}.{str(x[-2])}.{str(x[-1])}")
            except IndexError:
                cleandomain.append(str(x))
        # CDN name in the third-to-last position: keep four labels.
        elif len(x) > 3 and str(x[-3]).rstrip() in cdns:
            try:
                cleandomain.append(f"{str(x[-4])}.{str(x[-3])}.{str(x[-2])}.{str(x[-1])}")
            except IndexError:
                cleandomain.append(str(x))
        # Last two labels form a known two-part TLD: keep three labels.
        elif len(x) > 2 and f"{str(x[-2]).rstrip()}.{str(x[-1]).rstrip()}" in tld:
            try:
                cleandomain.append(f"{str(x[-3])}.{str(x[-2])}.{str(x[-1])}")
            except IndexError:
                cleandomain.append(str(x))
        # Last label is a known TLD: keep two labels.
        elif len(x) > 2 and str(x[-1]) in tld:
            try:
                cleandomain.append(f"{str(x[-2])}.{str(x[-1])}")
            except IndexError:
                cleandomain.append(str(x))
        # No TLD match: keep the value unchanged.
        else:
            cleandomain.append(str(x))
推荐阅读
- azure - 将域迁移到 Azure 后将 MS Outlook 邮箱添加到 AAD 用户
- javascript - 合并对象数组
- java - 确定一个比率以查看二叉搜索树在 java 中的平衡程度
- codeigniter - 如何将两列相乘并在codeigniter活动记录中求和?
- node.js - 运行繁重的预定 MongoDB 查询
- opengl - 如何处理 OpenGL 实现之间的差异
- swiftui - 在 macOS 上的 SwiftUI 中将颜色从源拖放到目标
- c - 在 K&R 字符计数示例的“C 编程语言”一书中
- flutter - 在带有颤振图的水平条形图上显示自定义目标线
- c++ - C++ 大数