首页 > 解决方案 > 使用缩放与未缩放数据确定最佳聚类数量时遇到的问题

问题描述

我正在尝试对我的数据进行聚类,但在确定最佳聚类数时遇到了一些问题。

我的数据(https://www.dropbox.com/s/6i6wyy0eohtlrrt/wellA.xlsx?dl=0)是一口石油勘探井,包含深度、rock_types(标签)和岩石特性(特征)的信息。我有标签信息,但我想看看 KMeans 如何处理这个问题。

问题是:数据未缩放时,肘部方法和轮廓分数的曲线都呈现出明显的趋势,但聚类结果很差。另一方面,缩放后的数据聚类效果更好,但其曲线的形状很“奇怪”……肘部曲线没有“肘部”,轮廓分数也比未缩放数据小得多。为什么缩放数据的曲线看起来反而更差? 在此处输入图像描述

我想知道我是否做错了什么。这些特征的变化幅度很大,我认为在使用 KMeans 之前应该先对它们进行缩放。也许我应该在找到最佳聚类数量之后再缩放数据?

PS:很抱歉问题和代码很长(大部分都是绘图代码)。我曾尝试用一个更简单的示例来重现这一切,但无法体现出这种异质性。

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the well log from the author's local spreadsheet (placeholder path).
data = pd.read_excel(r'C:\...\wellA.xlsx')
# -999.25 is turned into NaN so that the affected rows are dropped below
# (presumably the log's null marker -- confirm against the data source).
data = data.replace(-999.25, np.nan)
data.dropna(axis=0, inplace=True)

# FEATURES SELECTION FOR TRAINING
# Depth and the ground-truth rock type are excluded from the clustering input.
well = data.drop(['DEPTH','ROCK_TYPE'], axis=1)

# NORMALIZATION
# Keep the feature names and the (gappy, post-dropna) row index so that the
# scaled table stays aligned with `well` and `data`.
scaled_well = pd.DataFrame(
    MinMaxScaler().fit_transform(well),
    columns=well.columns,
    index=well.index,
)

# ELBOW METHOD AND SILHOUETTE SCORE
def optimal_k(data, title, k_values=range(2, 14)):
    """Plot the elbow (inertia) and silhouette curves over candidate k.

    For every candidate cluster count a KMeans model (k-means++ init,
    ``random_state=0``) is fitted on ``data.values``; its inertia and the
    silhouette score of the resulting labels are collected and drawn on
    two side-by-side axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; only ``data.values`` is used.
    title : str
        Text used as the figure's suptitle.
    k_values : iterable of int, optional
        Cluster counts to evaluate. Defaults to ``range(2, 14)``, the
        range the original code hard-coded.

    Returns
    -------
    None
        The function only draws on a new matplotlib figure.
    """
    # Materialise once: the same sequence drives the loop and both x-axes.
    k_values = list(k_values)
    features = data.values

    inertia = []
    sil = []
    for k in k_values:
        kmeans_rand = KMeans(n_clusters=k, init='k-means++', random_state=0)
        # fit_predict yields the training labels directly, so no separate
        # predict() pass over the same data is needed.
        y_pred = kmeans_rand.fit_predict(features)

        inertia.append(kmeans_rand.inertia_)
        sil.append(silhouette_score(features, y_pred))

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    ax[0].plot(k_values, inertia)
    ax[0].set_title('Elbow Method')
    ax[0].set_xlabel('Number of clusters')
    ax[0].set_ylabel('Inertia')

    ax[1].plot(k_values, sil)
    ax[1].set_xlabel('Number of Clusters')
    ax[1].set_ylabel('Silhouette Score')
    ax[1].set_title('Silhouette Score Curve')

    fig.suptitle(title)
    
# Draw the diagnostic curves for the raw features first, then the scaled ones.
for frame, caption in ((well, 'Not scaled'), (scaled_well, 'Scaled')):
    optimal_k(frame, caption)

# MODEL
def kmeans(data, k):
    """Cluster the rows of *data* with KMeans and record the labels in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table. It is modified in place: a ``'KMEANS'`` column holding
        the 1-based cluster label of every row is added (or overwritten).
    k : int
        Number of clusters.

    Returns
    -------
    numpy.ndarray
        The 1-based cluster labels that were written to ``data['KMEANS']``.
    """
    # A 'KMEANS' column left by an earlier call holds labels, not a feature;
    # leave it out so repeated calls cluster on the same inputs.
    features = data.drop(columns='KMEANS', errors='ignore')

    model = KMeans(n_clusters=k, random_state=0, init='k-means++')
    model.fit(features.values)

    # Shift from sklearn's 0-based labels to 1-based ones.
    labels = model.labels_ + 1
    data['KMEANS'] = labels
    return labels

# Cluster both feature tables into three groups; each call stores its labels
# in a 'KMEANS' column of the table it was given.
for frame in (well, scaled_well):
    kmeans(frame, 3)

# CONVERT NAME TO VALUE
facies = {
    'Claystone': 1,
    'Coal': 2,
    'Limestone': 3,
    'Marl': 4,
    'Sandstone': 5,
}
data['LABEL'] = data['ROCK_TYPE'].map(facies)

# PLOT
# Each label sequence is copied into an (n, 1) column array so that imshow
# can render it as a one-pixel-wide strip.
cluster_real = np.array(data['LABEL']).reshape(-1, 1)
cluster_kmeans = np.array(well['KMEANS']).reshape(-1, 1)
cluster_kmeans_scaled = np.array(scaled_well['KMEANS']).reshape(-1, 1)

# Three narrow strips side by side: true rock types, KMeans on the raw
# features, and KMeans on the scaled features.
f, ax = plt.subplots(nrows=1, ncols=3, figsize=(2,12))

# Same vertical extent for every strip; the maximum depth is passed as the
# bottom edge and the minimum depth as the top edge.
depth_extent = [0, 1, data['DEPTH'].max(), data['DEPTH'].min()]

# (image, upper colour limit, x-axis caption) for each strip, left to right.
panels = (
    (cluster_real, 5, 'REAL ROCKS'),
    (cluster_kmeans, 3, 'KMEANS'),
    (cluster_kmeans_scaled, 3, 'KMEANS SCALED'),
)

for axis, (strip, top_class, caption) in zip(ax, panels):
    axis.imshow(strip,
                interpolation='none',
                aspect='auto',
                vmin=1, vmax=top_class,
                extent=depth_extent)
    # The horizontal axis carries no information, so hide its ticks.
    axis.set_xticks([],[])
    axis.set_xlabel(caption)

# Only the leftmost strip gets the depth axis label.
ax[0].set_ylabel('Depth (m)')

标签: python performance cluster-analysis k-means unsupervised-learning

解决方案


一些聚类方法会自动为您找到最佳的聚类数量。亲和传播(Affinity Propagation)和均值偏移(Mean Shift)是我想到的两个。可能还有其他几种算法也能做到这一点。

from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets import make_blobs

# #############################################################################
# Generate sample data
# Sample data: 300 points drawn around three fixed centers.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
    n_samples=300,
    centers=centers,
    cluster_std=0.5,
    random_state=0,
)

# Compute Affinity Propagation
af = AffinityPropagation(preference=-50)
af.fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

# The number of clusters is the number of center indices the model reports.
n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)

# Scores that compare the predicted labels against the true ones, printed
# in this order; each scorer is called only when its line is printed.
label_reports = (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
    ("Adjusted Mutual Information: %0.3f",
     metrics.adjusted_mutual_info_score),
)
for template, scorer in label_reports:
    print(template % scorer(labels_true, labels))

# The silhouette coefficient is computed from the data and the predicted
# labels only, so it is reported separately.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

# Start from a clean figure 1.
plt.close('all')
plt.figure(1)
plt.clf()

# One colour letter per cluster, cycling if there are more clusters than
# letters in the string.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for cluster_id, color in zip(range(n_clusters_), colors):
    in_cluster = labels == cluster_id
    members = X[in_cluster]
    exemplar = X[cluster_centers_indices[cluster_id]]
    cx, cy = exemplar[0], exemplar[1]

    # Member points as dots, the cluster center as a large outlined circle.
    plt.plot(members[:, 0], members[:, 1], color + '.')
    plt.plot(cx, cy, 'o', markerfacecolor=color,
             markeredgecolor='k', markersize=14)

    # A line from the center to every member of the cluster.
    for point in members:
        plt.plot([cx, point[0]], [cy, point[1]], color)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

在此处输入图像描述


推荐阅读