首页 > 解决方案 > 两组数据点之间的聚类 - Python

问题描述

我希望使用 k-means 聚类来绘制并返回每个聚类质心的位置。下面将两组 xy 散点分为 6 个簇。

使用下面的 df,将 A 和 B 以及 C 和 D 两组列中的坐标绘制为散点图。我希望绘制并返回每个簇的质心。

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

# Toy data: 100 rows of random integers from -50 up to (not including) 50,
# one column per letter A-D.
df = pd.DataFrame(
    np.random.randint(-50, 50, size=(100, 4)),
    columns=list('ABCD'),
)

fig, ax = plt.subplots()

# All four columns go into one feature matrix, so every row is handed to
# KMeans as a single 4-feature sample.
Y_sklearn = df[['A', 'B', 'C', 'D']].values

model = KMeans(n_clusters=4)
model.fit(Y_sklearn)

# Draw the (A, B) pairs, then the (C, D) pairs, on the same axes; both layers
# are coloured by the same per-row cluster label.
for x_col, y_col in ((0, 1), (2, 3)):
    plt.scatter(Y_sklearn[:, x_col], Y_sklearn[:, y_col], c=model.labels_)

plt.show()

标签: python pandas cluster-analysis

解决方案


根据您绘制散点图的方式,我猜测 A、B 对应第一组点的 xy 坐标,而 C、D 对应第二组点的 xy 坐标。如果是这样,您不能直接把 KMeans 应用于整个数据框,因为每个点只有两个特征,即 x 和 y 坐标。找到质心其实很简单,只需要读取 model_zero.cluster_centers_ 即可。

让我们首先构建一个更适合可视化的数据框

import numpy as np
# Fix the global NumPy seed so the generated dataset is the same on every run.
np.random.seed(365)
# Identity matrix, used below as the covariance of every 2-D Gaussian.
stds = np.eye(2)
# Point set zero: four integer cluster means and the sample count per cluster.
means_zero = np.random.randint(10, 20, (4, 2))
sizes_zero = np.array([20, 30, 15, 35])
# Point set one: four integer cluster means and the sample count per cluster.
means_one = np.random.randint(0, 10, (4, 2))
sizes_one = np.array([20, 20, 25, 35])


def _sample_clusters(means, sizes):
    """Draw ``count`` Gaussian points around each mean and stack them row-wise."""
    blobs = []
    for centre, count in zip(means, sizes):
        blobs.append(np.random.multivariate_normal(centre, stds, size=count))
    return np.vstack(blobs)


# Set zero is sampled before set one so the random draws happen in a fixed order.
points_zero = _sample_clusters(means_zero, sizes_zero)
points_one = _sample_clusters(means_one, sizes_one)
# Each size list sums to 100, so the two (100, 2) arrays can sit side by side:
# columns 0-1 are point set zero, columns 2-3 are point set one.
all_points = np.hstack((points_zero, points_one))

如您所见,每组点中的四个簇都是从四个均值不同的高斯分布中采样得到的。使用这些数据,您可以按以下方式绘图:

import matplotlib.patheffects as PathEffects
from sklearn.cluster import KMeans

# Columns A/B carry point set zero, columns C/D carry point set one.
df = pd.DataFrame(all_points, columns=list('ABCD'))

fig, ax = plt.subplots(figsize=(10, 8))

scatter_zero = df[['A', 'B']].values
scatter_one = df[['C', 'D']].values

# Each point set is clustered separately on its own (x, y) columns.
model_zero = KMeans(n_clusters=4)
model_zero.fit(scatter_zero)
model_one = KMeans(n_clusters=4)
model_one.fit(scatter_one)

# One scatter layer per point set, coloured by that set's own cluster labels.
for cloud, fitted in ((scatter_zero, model_zero), (scatter_one, model_one)):
    plt.scatter(cloud[:, 0], cloud[:, 1], c=fitted.labels_, cmap='bwr')


def _outlined_text(x, y, label, halo, **text_kwargs):
    """Write ``label`` at (x, y) on ``ax`` with a width-5 stroke coloured ``halo``."""
    artist = ax.text(x, y, label, **text_kwargs)
    artist.set_path_effects(
        [PathEffects.Stroke(linewidth=5, foreground=halo), PathEffects.Normal()]
    )
    return artist


# plot the cluster centers
txts = []
for fitted, halo in ((model_zero, "aquamarine"), (model_one, "lime")):
    for ind, pos in enumerate(fitted.cluster_centers_):
        caption = 'cluster %i \n (%.1f,%.1f)' % (ind, pos[0], pos[1])
        txts.append(
            _outlined_text(pos[0], pos[1], caption, halo, fontsize=12, zorder=100)
        )

# Label each point set as a whole at the mean of its four cluster centres.
zero_mean = np.mean(model_zero.cluster_centers_, axis=0)
one_mean = np.mean(model_one.cluster_centers_, axis=0)
for centre, caption in ((zero_mean, 'point set zero'), (one_mean, 'point set one')):
    txts.append(_outlined_text(centre[0], centre[1], caption, "violet", fontsize=15))

plt.show()

运行这段代码,你会得到

在此处输入图像描述


推荐阅读