I'm trying to cluster my data and I'm having trouble identifying the optimal number of clusters.
My data ( https://www.dropbox.com/s/6i6wyy0eohtlrrt/wellA.xlsx?dl=0 ) is from an oil exploration well, with depth, rock types (labels) and rock properties (features). I already have the label information, but I'd like to see how KMeans performs on this.
The problem is that the elbow method and silhouette score show a clear trend when the data isn't scaled, but the clustering is poor. On the other hand, the scaled data produces better clusters, but its diagnostic plots have "weird" shapes: the elbow curve has no obvious elbow, and the silhouette scores are much smaller than for the non-scaled data. Why do the diagnostics look worse for the scaled data?
I'd like to know if I'm doing something wrong. The features vary over very different ranges, so I think they should be scaled for KMeans. Or should I scale the data only after finding the optimal number of clusters?
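For context, it's easy to check how different the raw feature scales are (this is just a diagnostic sketch, using the `well` features DataFrame built in the code below):
# Sketch: inspect raw feature ranges; KMeans inertia is dominated by
# whichever features have the largest spread, which is why scaling matters
print(well.describe().loc[['min', 'max', 'std']])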
PS: Sorry for the long question and code (most of it is plotting). I tried to reduce all of this to a simpler example, but I couldn't reproduce the heterogeneity.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
data = pd.read_excel(r'C:\...\wellA.xlsx')
data = data.replace(-999.25, np.nan)  # treat the -999.25 sentinel as missing
data.dropna(axis=0, inplace=True)
# FEATURES SELECTION FOR TRAINING
well = data.drop(['DEPTH','ROCK_TYPE'], axis=1)
# NORMALIZATION
scaled_well = pd.DataFrame(MinMaxScaler().fit_transform(well), columns=well.columns)
# ELBOW METHOD AND SILHOUETTE SCORE
def optimal_k(data, title):
    inertia = []
    sil = []
    for k in range(2, 14):
        kmeans_rand = KMeans(n_clusters=k, init='k-means++', random_state=0)
        kmeans_rand.fit(data.values)
        y_pred = kmeans_rand.predict(data.values)
        inertia.append(kmeans_rand.inertia_)
        sil.append((k, silhouette_score(data.values, y_pred)))
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    ax[0].plot(range(2, 14), inertia)
    ax[0].set_title('Elbow Method')
    ax[0].set_xlabel('Number of clusters')
    ax[0].set_ylabel('Inertia')
    x_sil = [x[0] for x in sil]
    y_sil = [x[1] for x in sil]
    ax[1].plot(x_sil, y_sil)
    ax[1].set_xlabel('Number of Clusters')
    ax[1].set_ylabel('Silhouette Score')
    ax[1].set_title('Silhouette Score Curve')
    fig.suptitle(title)
optimal_k(well, 'Not scaled')
optimal_k(scaled_well, 'Scaled')
# MODEL
def kmeans(data, k):
    model = KMeans(n_clusters=k, random_state=0, init='k-means++')
    model.fit(data.values)
    labels = model.labels_
    data['KMEANS'] = labels + 1
kmeans(well,3)
kmeans(scaled_well,3)
# CONVERT NAME TO VALUE
facies = {'Claystone':1, 'Coal':2, 'Limestone':3, 'Marl':4, 'Sandstone':5}
data['LABEL'] = data['ROCK_TYPE'].map(facies)
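Since the true facies are now numeric, both partitions can also be scored against them with the adjusted Rand index (a sketch, not part of the original workflow; ARI is invariant to cluster renumbering, so the 1..3 cluster ids need no matching to the 1..5 facies codes):
from sklearn.metrics import adjusted_rand_score
# Sketch: score each KMeans partition against the true facies (1.0 = perfect)
print('ARI (raw):    %.3f' % adjusted_rand_score(data['LABEL'], well['KMEANS']))
print('ARI (scaled): %.3f' % adjusted_rand_score(data['LABEL'], scaled_well['KMEANS']))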
# PLOT
# each log becomes a single-column image so imshow can draw it as a strip
cluster_real = np.expand_dims(data['LABEL'].values, 1)
cluster_kmeans = np.expand_dims(well['KMEANS'].values, 1)
cluster_kmeans_scaled = np.expand_dims(scaled_well['KMEANS'].values, 1)
f, ax = plt.subplots(nrows=1, ncols=3, figsize=(2, 12))
ax[0].imshow(cluster_real,
             interpolation='none',
             aspect='auto',
             vmin=1, vmax=5,
             extent=[0, 1, data['DEPTH'].max(), data['DEPTH'].min()])
ax[1].imshow(cluster_kmeans,
             interpolation='none',
             aspect='auto',
             vmin=1, vmax=3,
             extent=[0, 1, data['DEPTH'].max(), data['DEPTH'].min()])
ax[2].imshow(cluster_kmeans_scaled,
             interpolation='none',
             aspect='auto',
             vmin=1, vmax=3,
             extent=[0, 1, data['DEPTH'].max(), data['DEPTH'].min()])
ax[0].set_ylabel('Depth (m)')
ax[0].set_xticks([])
ax[0].set_xlabel('REAL ROCKS')
ax[1].set_xticks([])
ax[1].set_xlabel('KMEANS')
ax[2].set_xticks([])
ax[2].set_xlabel('KMEANS SCALED')
Some clustering methods will estimate the number of clusters for you automatically. Affinity Propagation and Mean Shift are two that come to mind; there are probably others that do the same. Below is an Affinity Propagation example (essentially the scikit-learn demo), followed by a short Mean Shift sketch.
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets import make_blobs
# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
                            random_state=0)
# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50, random_state=0).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels, metric='sqeuclidean'))
# #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
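And for Mean Shift, a minimal sketch reusing the X from the blob data above (quantile=0.2 is just a starting point, tune it for your own data):
from sklearn.cluster import MeanShift, estimate_bandwidth
# estimate_bandwidth infers a kernel bandwidth from the data itself;
# the bandwidth in turn determines how many clusters Mean Shift finds
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=300)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
print('Estimated number of clusters: %d' % len(ms.cluster_centers_))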