python - 找到具有缩放和非缩放数据的最佳集群数量的问题
问题描述
我正在尝试对我的数据进行聚类,但在确定最佳聚类数时遇到了一些问题。
我的数据(https://www.dropbox.com/s/6i6wyy0eohtlrrt/wellA.xlsx?dl=0)是一口石油勘探井,包含深度、rock_types(标签)和岩石特性(特征)的信息。我有标签信息,但我想看看 KMeans 如何处理这个问题。
问题是肘部方法和轮廓分数在数据未缩放时显示出明显的趋势,但聚类不良。另一方面,缩放数据显示更好的集群,但其图形具有“奇怪”的形状……第一个没有“肘部”,另一个的轮廓分数比非缩放数据小得多。为什么我看到缩放数据的图表更差?
我想知道我是否做错了什么。这些特征是高度可变的,我认为它们应该为 KMeans 目的而缩放。也许我应该在找到最佳集群数量后缩放数据?
PS:很抱歉问题和代码很长(大部分都是绘图代码)。我曾试图把这一切简化成一个更小的示例,但无法重现数据的这种异质性。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# LOAD & CLEAN
# -999.25 is the conventional null sentinel in well-log files; treat it as NaN
# and drop any row with missing values.
data = pd.read_excel(r'C:\...\wellA.xlsx')
data = data.replace(-999.25, np.nan)
data.dropna(axis=0, inplace=True)
# FEATURES SELECTION FOR TRAINING
# DEPTH is positional, ROCK_TYPE is the label -- neither is a clustering feature.
well = data.drop(['DEPTH','ROCK_TYPE'], axis=1)
# NORMALIZATION
# Bug fix: MinMaxScaler is used here but only StandardScaler was imported.
# Also keep the original column names/index so the scaled frame stays readable.
scaled_well = pd.DataFrame(MinMaxScaler().fit_transform(well),
                           columns=well.columns, index=well.index)
# ELBOW METHOD AND SILHOUETTE SCORE
def optimal_k(data, title):
    """Plot the elbow (inertia) and silhouette-score curves for k = 2..13.

    Parameters
    ----------
    data : pd.DataFrame
        Feature matrix to cluster (rows = samples, columns = features).
    title : str
        Overall figure title (used to distinguish scaled vs. unscaled runs).
    """
    inertia = []
    sil = []
    for k in range(2, 14):
        kmeans_rand = KMeans(n_clusters=k, init='k-means++', random_state=0)
        # fit_predict yields the training labels in one pass; the original
        # fit() followed by predict() clustered the same data twice.
        y_pred = kmeans_rand.fit_predict(data.values)
        inertia.append(kmeans_rand.inertia_)
        sil.append((k, silhouette_score(data.values, y_pred)))
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    ax[0].plot(range(2, 14), inertia)
    ax[0].set_title('Elbow Method')
    ax[0].set_xlabel('Number of clusters')
    ax[0].set_ylabel('Inertia')
    x_sil = [k for k, _ in sil]
    y_sil = [s for _, s in sil]
    ax[1].plot(x_sil, y_sil)
    ax[1].set_xlabel('Number of Clusters')
    # typo fix: "Silhouetter" -> "Silhouette"
    ax[1].set_ylabel('Silhouette Score')
    ax[1].set_title('Silhouette Score Curve')
    fig.suptitle(title)
# Run the k-selection diagnostics on both the raw and the min-max-scaled features.
optimal_k(well, 'Not scaled')
optimal_k(scaled_well, 'Scaled')
# MODEL
def kmeans(data, k):
    """Cluster *data* into *k* groups and append the labels in place.

    Adds a 'KMEANS' column to *data* holding 1-based cluster ids
    (mutation is intentional: the caller's frame gains the assignment).
    """
    model = KMeans(n_clusters=k, random_state=0, init='k-means++')
    # fit_predict(...) is equivalent to fit(...).labels_ for KMeans;
    # +1 shifts the labels from 0-based to 1-based.
    data['KMEANS'] = model.fit_predict(data.values) + 1
# Cluster both feature sets with k=3 (adds a 'KMEANS' column to each frame).
kmeans(well,3)
kmeans(scaled_well,3)
# CONVERT NAME TO VALUE
facies = {'Claystone':1, 'Coal':2, 'Limestone':3, 'Marl':4, 'Sandstone':5}
data['LABEL'] = data['ROCK_TYPE'].map(facies)
# PLOT
# The original np.repeat(np.expand_dims(x, 1), 1, 1) repeated the column
# exactly once -- a no-op.  A plain (n, 1) column vector is all imshow needs.
cluster_real = np.expand_dims(data['LABEL'].to_numpy(), 1)
cluster_kmeans = np.expand_dims(well['KMEANS'].to_numpy(), 1)
cluster_kmeans_scaled = np.expand_dims(scaled_well['KMEANS'].to_numpy(), 1)
# Three side-by-side depth tracks: true facies, k-means (raw), k-means (scaled).
f, ax = plt.subplots(nrows=1, ncols=3, figsize=(2,12))
# Shared extent: x spans [0, 1], y runs from max depth (bottom) to min (top).
depth_extent = [0, 1, data['DEPTH'].max(), data['DEPTH'].min()]
tracks = [
    (cluster_real, 5, 'REAL ROCKS'),            # 5 facies classes
    (cluster_kmeans, 3, 'KMEANS'),              # 3 clusters, raw features
    (cluster_kmeans_scaled, 3, 'KMEANS SCALED'),# 3 clusters, scaled features
]
for axis, (img, vtop, label) in zip(ax, tracks):
    axis.imshow(img,
                interpolation='none',
                aspect='auto',
                vmin=1, vmax=vtop,
                extent=depth_extent)
    axis.set_xticks([],[])
    axis.set_xlabel(label)
ax[0].set_ylabel('Depth (m)')
解决方案
一些聚类方法会自动为您找到最佳的聚类数量。亲和传播(Affinity Propagation)和均值偏移(Mean Shift)是我首先想到的两种,可能还有其他几种方法也能做到这一点。
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets import make_blobs
# #############################################################################
# Generate sample data: three well-separated Gaussian blobs.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
                            random_state=0)
# #############################################################################
# Compute Affinity Propagation
# random_state pins the randomised initialisation so runs are reproducible;
# scikit-learn >= 0.23 emits a warning when it is omitted.
af = AffinityPropagation(preference=-50, random_state=0).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
# The algorithm chooses the number of clusters itself (driven by `preference`).
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# Supervised metrics (compare against the known blob labels):
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
# Unsupervised metric (no ground truth needed):
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
# #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
plt.close('all')
plt.figure(1)
plt.clf()
# Cycle through a fixed colour sequence, one colour per cluster.
palette = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for idx, col in zip(range(n_clusters_), palette):
    members = labels == idx
    center = X[cluster_centers_indices[idx]]
    # Points belonging to this cluster.
    plt.plot(X[members, 0], X[members, 1], col + '.')
    # The cluster's exemplar, drawn larger with a black edge.
    plt.plot(center[0], center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # Spokes connecting the exemplar to each member point.
    for pt in X[members]:
        plt.plot([center[0], pt[0]], [center[1], pt[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
推荐阅读
- azure-devops - AzureDevOps 测试计划进度报告中的详细信息部分不显示测试套件下的测试
- postgresql - 使用 Last_value() 用 Last not null 填充 null 值
- node.js - 猫鼬错误:检测到循环依赖
- apache-flink - Flink 水印策略
- git - 无法使用正则表达式在 git hub 中创建有效的分支名称
- python - 打印div内容python
- linux - 无法退出源文件中的函数
- next.js - Next.js 以线性渐变为背景的图像组件
- java - 我无法将对象添加到我的数据库中,因为我无法在后台线程中初始化 Dao 对象。甚至 db.PetsDao() 也无济于事
- r - 从 R 中的列中搜索单词/短语