python - 为什么这个核感知器(kernel perceptron)实现需要无限长的时间才能运行完?
问题描述
我正在尝试在一个数据集上应用核感知器(kernel perceptron)算法。我已经编写并运行了代码,训练和预测都能正常工作,但是当我尝试绘制决策边界时,程序似乎永远运行不完。代码如下:
# All the import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
import pylab as pl
#Kernel Perceptron class where I wrote fit and predict functions
def linear_kernel(x1, x2):
    """Return the linear kernel of two inputs, i.e. their dot product.

    Args:
        x1: First input, anything ``np.dot`` accepts.
        x2: Second input, compatible with ``x1`` for ``np.dot``.

    Returns:
        The value of ``np.dot(x1, x2)``.
    """
    return np.dot(x1, x2)
def polynomial_kernel(x, y, p=3):
    """Return the polynomial kernel ``(1 + x . y) ** p``.

    Args:
        x: First input, anything ``np.dot`` accepts.
        y: Second input, compatible with ``x`` for ``np.dot``.
        p: Exponent applied to ``1 + np.dot(x, y)``; defaults to 3.

    Returns:
        ``(1 + np.dot(x, y)) ** p``.
    """
    return (1 + np.dot(x, y)) ** p
class KernelPerceptron(object):
    """Kernel perceptron classifier trained with mistake-driven updates.

    ``fit`` sets three attributes:
        alpha: Mistake counts of the retained training samples.
        sv: The retained training samples (those with a non-zero count).
        sv_y: Labels of the retained samples.
    """

    def __init__(self, kernel=linear_kernel, T=1):
        """Store the kernel function and the number of training passes.

        Args:
            kernel: Callable ``kernel(a, b)`` evaluated on pairs of samples.
            T: Number of full passes over the training set made by ``fit``.
        """
        self.kernel = kernel
        self.T = T

    def fit(self, X, y):
        """Train on ``X`` / ``y`` and keep only the support vectors.

        Args:
            X: 2-D array with one sample per row.
            y: 1-D array with one label per row of ``X``. Predictions are
                ``np.sign`` values, so labels are presumably -1/+1 --
                confirm against the dataset.
        """
        n_samples, _ = X.shape
        self.alpha = np.zeros(n_samples, dtype=np.float64)

        # Gram matrix: kernel value for every pair of training samples.
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])

        for _ in range(self.T):
            for i in range(n_samples):
                # np.sign(0) is 0, so a zero score never matches a
                # non-zero label and is treated as a mistake.
                if np.sign(np.sum(K[:, i] * self.alpha * y)) != y[i]:
                    self.alpha[i] += 1.0

        # Support vectors: samples that were misclassified at least once.
        sv = self.alpha > 1e-5
        self.alpha = self.alpha[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        print(len(self.alpha), n_samples)

    def project(self, X):
        """Return the raw decision value for every row of ``X``.

        The kernel is called once per (row, support vector) pair inside
        Python loops, so the cost grows with ``len(X) * len(self.sv)``.
        """
        y_predict = np.zeros(len(X))
        # alpha * label does not depend on the query point; compute it once.
        coefficients = self.alpha * self.sv_y
        for i, x in enumerate(X):
            y_predict[i] = sum(
                c * self.kernel(x, sv) for c, sv in zip(coefficients, self.sv)
            )
        return y_predict

    def predict(self, X):
        """Return ``np.sign`` of the decision value for each sample in ``X``."""
        X = np.atleast_2d(X)
        _, _ = X.shape  # raises for inputs with more than two dimensions
        return np.sign(self.project(X))
#Testing on the dataset I have
data = pd.read_csv("Dataset_1_Team_35.csv").to_numpy()
# Use the first 1000 rows: columns 0 and 1 are the point coordinates,
# column 2 is the label. Assumes all three columns are numeric.
X = np.array(data[:1000, :2])
y = np.array(data[:1000, 2])
print(X.shape, y.shape)
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2)
print(xtr.shape, ytr.shape)
clf = KernelPerceptron(polynomial_kernel, 2)
clf.fit(xtr, ytr)
pred = clf.predict(xtr)
# accuracy_score expects (y_true, y_pred); the score itself is symmetric.
val = accuracy_score(ytr, pred)
print(val)
#Code for plotting the decision boundary
def make_meshgrid(x, y, h=.02):
    """Build a rectangular grid covering the data with a margin of 1.

    Args:
        x: Array of values for the horizontal axis.
        y: Array of values for the vertical axis.
        h: Step between neighbouring grid points on both axes.

    Returns:
        The pair ``(xx, yy)`` produced by ``np.meshgrid``.

    The number of grid points grows roughly with ``1 / h**2``, so a small
    step over a wide data range produces a very large grid.
    """
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
    """Draw the classifier's predictions over a grid as filled contours.

    Args:
        ax: Object providing ``contourf`` (a matplotlib axes in this script).
        clf: Object providing ``predict`` for an array of 2-column points.
        xx: Grid x-coordinates, as returned by ``np.meshgrid``.
        yy: Grid y-coordinates, same shape as ``xx``.
        **params: Extra keyword arguments forwarded to ``ax.contourf``.

    Returns:
        Whatever ``ax.contourf`` returns.
    """
    # One prediction per grid point: flatten both grids into (n_points, 2).
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
fig, ax = plt.subplots()
X0, X1 = xtr[:, 0], xtr[:, 1]
xx, yy = make_meshgrid(X0, X1)
# Slow step: every grid point (default step 0.02) is predicted in a Python loop.
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=1) #line taking infinite time to load
ax.scatter(X0, X1, c=ytr, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
# No `title` variable is defined in this script, so pass the text directly.
ax.set_title('Kernel perceptron decision boundary')
于是我在 Jupyter notebook 中逐行运行代码进行调试,发现下面这一行
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=1)
似乎需要无限长的时间才能运行完。而当我在另一个数据集上运行该算法时,耗时要短得多。
有人能帮我解决这个问题吗?
如果有人想要数据集,它是一个简单的数据集,其中包含 1000 个条目(点及其相应的标签)。链接到数据集。
解决方案
程序并不是真的需要无限的时间。训练时你只拟合 800 个数据点,随后也只对这 800 个点进行预测;但是在绘图时,步长为 0.02 的网格会产生 28889748 个数据点,而 project 方法要用 Python 循环对每个网格点逐一与所有支持向量计算核函数,因此需要花费非常长的时间。
为了减少绘图时需要预测的网格点数量,我建议做两件事:
1)使用 StandardScaler 对数据进行标准化(缩小特征的取值范围,网格覆盖的范围也随之变小)
2)在 make_meshgrid 函数中创建网格时增大步长(例如从 0.02 增大到 0.2)
这是修改后的代码:
# All the import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import random
import pylab as pl
#Kernel Perceptron class where I wrote fit and predict functions
def linear_kernel(x1, x2):
    """Return the linear kernel of two inputs, i.e. their dot product.

    Args:
        x1: First input, anything ``np.dot`` accepts.
        x2: Second input, compatible with ``x1`` for ``np.dot``.

    Returns:
        The value of ``np.dot(x1, x2)``.
    """
    return np.dot(x1, x2)
def polynomial_kernel(x, y, p=3):
    """Return the polynomial kernel ``(1 + x . y) ** p``.

    Args:
        x: First input, anything ``np.dot`` accepts.
        y: Second input, compatible with ``x`` for ``np.dot``.
        p: Exponent applied to ``1 + np.dot(x, y)``; defaults to 3.

    Returns:
        ``(1 + np.dot(x, y)) ** p``.
    """
    return (1 + np.dot(x, y)) ** p
class KernelPerceptron(object):
    """Kernel perceptron classifier trained with mistake-driven updates.

    ``fit`` sets three attributes:
        alpha: Mistake counts of the retained training samples.
        sv: The retained training samples (those with a non-zero count).
        sv_y: Labels of the retained samples.
    """

    def __init__(self, kernel=linear_kernel, T=1):
        """Store the kernel function and the number of training passes.

        Args:
            kernel: Callable ``kernel(a, b)`` evaluated on pairs of samples.
            T: Number of full passes over the training set made by ``fit``.
        """
        self.kernel = kernel
        self.T = T

    def fit(self, X, y):
        """Train on ``X`` / ``y`` and keep only the support vectors.

        Args:
            X: 2-D array with one sample per row.
            y: 1-D array with one label per row of ``X``. Predictions are
                ``np.sign`` values, so labels are presumably -1/+1 --
                confirm against the dataset.
        """
        n_samples, _ = X.shape
        self.alpha = np.zeros(n_samples, dtype=np.float64)

        # Gram matrix: kernel value for every pair of training samples.
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])

        for _ in range(self.T):
            for i in range(n_samples):
                # np.sign(0) is 0, so a zero score never matches a
                # non-zero label and is treated as a mistake.
                if np.sign(np.sum(K[:, i] * self.alpha * y)) != y[i]:
                    self.alpha[i] += 1.0

        # Support vectors: samples that were misclassified at least once.
        sv = self.alpha > 1e-5
        self.alpha = self.alpha[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        print(len(self.alpha), n_samples)

    def project(self, X):
        """Return the raw decision value for every row of ``X``.

        The kernel is called once per (row, support vector) pair inside
        Python loops, so the cost grows with ``len(X) * len(self.sv)``.
        """
        y_predict = np.zeros(len(X))
        # Report how many points are about to be evaluated.
        print(f'data points len: {len(X)}')
        # alpha * label does not depend on the query point; compute it once.
        coefficients = self.alpha * self.sv_y
        for i, x in enumerate(X):
            y_predict[i] = sum(
                c * self.kernel(x, sv) for c, sv in zip(coefficients, self.sv)
            )
        return y_predict

    def predict(self, X):
        """Return ``np.sign`` of the decision value for each sample in ``X``."""
        X = np.atleast_2d(X)
        _, _ = X.shape  # raises for inputs with more than two dimensions
        return np.sign(self.project(X))
#Testing on the dataset I have
data = pd.read_csv("Dataset_1_Team_35.csv").to_numpy()
# Use the first 1000 rows: columns 0 and 1 are the point coordinates,
# column 2 is the label. Assumes all three columns are numeric.
X = np.array(data[:1000, :2])
# Standardize the features before training; the plotting grid is later
# built from the scaled training points.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
y = np.array(data[:1000, 2])
print(X.shape, y.shape)
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2)
print(xtr.shape, ytr.shape)
clf = KernelPerceptron(polynomial_kernel, 2)
clf.fit(xtr, ytr)
print(f'xtr: {xtr}')
pred = clf.predict(xtr)
# accuracy_score expects (y_true, y_pred); the score itself is symmetric.
val = accuracy_score(ytr, pred)
print(val)
#Code for plotting the decision boundary
def make_meshgrid(x, y, h=.02):
    """Build a rectangular grid covering the data with a margin of 1.

    Args:
        x: Array of values for the horizontal axis.
        y: Array of values for the vertical axis.
        h: Step between neighbouring grid points on both axes.

    Returns:
        The pair ``(xx, yy)`` produced by ``np.meshgrid``.

    The number of grid points grows roughly with ``1 / h**2``, so a small
    step over a wide data range produces a very large grid.
    """
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
    """Draw the classifier's predictions over a grid as filled contours.

    Args:
        ax: Object providing ``contourf`` (a matplotlib axes in this script).
        clf: Object providing ``predict`` for an array of 2-column points.
        xx: Grid x-coordinates, as returned by ``np.meshgrid``.
        yy: Grid y-coordinates, same shape as ``xx``.
        **params: Extra keyword arguments forwarded to ``ax.contourf``.

    Returns:
        Whatever ``ax.contourf`` returns.
    """
    # One prediction per grid point: flatten both grids into (n_points, 2).
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
# Draw the fitted classifier's decision regions with the training points on top.
fig, ax = plt.subplots()
X0 = xtr[:, 0]
X1 = xtr[:, 1]
# Grid step 0.2 instead of make_meshgrid's default 0.02.
xx, yy = make_meshgrid(X0, X1, 0.2)
plot_contours(ax, clf, xx, yy, alpha=1, cmap=plt.cm.coolwarm)
ax.scatter(X0, X1, s=20, c=ytr, edgecolors='k', cmap=plt.cm.coolwarm)
ax.set_title('title')
plt.show()
推荐阅读
- ios - 如何设置一个整数值作为导航栏的标题?
- eloquent - 雄辩,如果两个表中的任何一个中都存在一个值,如何选择行?
- regex - 用于匹配最后一行的正则表达式
- java - ListView 不显示在屏幕上
- r - 如何生成非功能的平均曲线?
- reactjs - 可以在 React 中像这样在 setState 中进行回调吗?
- asp.net - 剃刀页面“asp-page”未链接到指定位置,而是链接到当前页面
- r - dplyr 分组帮助,如何根据另一列的最小值从另一列中选择一个值
- c# - 指定类型未在目标服务器 EF Core 上注册以进行空间点批量插入
- c# - Google Cloud Datastore 从 VB.net 进行仅键查询