import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# KMeans initialisation
# Comparing the 'random' and 'k-means++' initialisation strategies of KMeans.
from sklearn.datasets import make_blobs
# Synthetic 2-D dataset: 1000 points drawn around 3 centres, each blob with
# standard deviation 3.5. The fixed random_state keeps the data reproducible.
X, y = make_blobs(n_samples=1000, centers=3, n_features=2, random_state=42, cluster_std=3.5)

# Colour every point by the blob it was generated from. The trailing ';'
# keeps the notebook from echoing the returned artist.
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='viridis');
# Run k-means with 5 clusters (the data X was generated from 3 blobs) for
# both initialisation strategies and several random seeds.
# Top row: init='random'; bottom row: init='k-means++'; one column per seed.
from sklearn.cluster import KMeans

fig, ax = plt.subplots(2, 6, figsize=(24, 6))
for i, init in enumerate(['random', 'k-means++']):
    for j, random_state in enumerate(range(6)):
        # n_init is left at the library default here.
        km = KMeans(n_clusters=5, random_state=random_state*10, init=init)
        km.fit(X)
        y_km = km.predict(X)

        # Points coloured by their assigned cluster.
        ax[i, j].scatter(X[:, 0], X[:, 1], c=y_km, s=50, cmap='viridis')
        # Fitted cluster centres as red stars.
        ax[i, j].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
                         s=200, marker='*', c='red')
        ax[i, j].set_title(
            f'init={init}, random_state={random_state}\n inertia={km.inertia_:.2f}')
fig.tight_layout()
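# We get (nearly) the same result with different initializations!
# Depending on its `n_init` setting, KMeans may restart from several
# initializations and keep the run with the lowest inertia, which hides the
# effect of any single initialization. Forcing n_init=1 exposes the difference.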
# Run k-means with 5 clusters (the data X was generated from 3 blobs) for
# both initialisation strategies and several random seeds, this time with
# n_init=1 so that each fit uses exactly one initialisation.
# Top row: init='random'; bottom row: init='k-means++'; one column per seed.
from sklearn.cluster import KMeans

fig, ax = plt.subplots(2, 6, figsize=(24, 6))
for i, init in enumerate(['random', 'k-means++']):
    for j, random_state in enumerate(range(6)):
        km = KMeans(n_clusters=5, random_state=random_state*10, init=init,
                    n_init=1)
        km.fit(X)
        y_km = km.predict(X)

        # Points coloured by their assigned cluster.
        ax[i, j].scatter(X[:, 0], X[:, 1], c=y_km, s=50, cmap='viridis')
        # Fitted cluster centres as red stars.
        ax[i, j].scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
                         s=200, marker='*', c='red')
        ax[i, j].set_title(
            f'init={init}, random_state={random_state}\n inertia={km.inertia_:.2f}')
fig.tight_layout()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Smaller 2-D dataset for the inertia comparison.
X, y = make_blobs(n_samples=200, centers=3, n_features=2, random_state=42, cluster_std=3.5)

# One inner list per value of k; each inner list collects the final inertia
# of 10 single-initialisation runs (seeds 0..9).
random_inertias = []
kmeans_inertias = []

# Run KMeans with random initialization and KMeans++ initialization
# for various number of clusters
for k in range(2, 7):
    random_inertias.append([])
    kmeans_inertias.append([])
    for i in range(10):
        # n_init=1 so the inertia reflects a single initialisation; both
        # strategies share the same seed i.
        km_random = KMeans(n_clusters=k, init='random', n_init=1, random_state=i)
        km_kmeans = KMeans(n_clusters=k, init='k-means++', n_init=1, random_state=i)

        km_random.fit(X)
        km_kmeans.fit(X)

        random_inertias[-1].append(km_random.inertia_)
        kmeans_inertias[-1].append(km_kmeans.inertia_)
import pandas as pd
# Summarise the inertia experiment: rows are the cluster counts, columns the
# two initialisation strategies. axis=1 aggregates over the runs collected
# for each cluster count.
cluster_counts = range(2, 7)  # must match the k values used to fill the lists
random_arr = np.array(random_inertias)
kmeans_arr = np.array(kmeans_inertias)

# Mean inertia per cluster count.
df = pd.DataFrame({'random': random_arr.mean(axis=1),
                   'kmeans++': kmeans_arr.mean(axis=1)},
                  index=cluster_counts)
# Standard deviation across runs, drawn as error bars.
yerr = pd.DataFrame({'random': random_arr.std(axis=1),
                     'kmeans++': kmeans_arr.std(axis=1)},
                    index=cluster_counts)
ax = df.plot(kind='bar', yerr=yerr, figsize=(10, 6), rot=0)
# Let us rather work in higher dimensions
X, y = make_blobs(n_samples=500, centers=3, n_features=10, random_state=42, cluster_std=3.5)

# One inner list per value of k; each inner list collects the final inertia
# of 10 single-initialisation runs (seeds 0..9).
random_inertias = []
kmeans_inertias = []

# Run KMeans with random initialization and KMeans++ initialization
# for various number of clusters
for k in range(2, 7):
    random_inertias.append([])
    kmeans_inertias.append([])
    for i in range(10):
        # n_init=1 so the inertia reflects a single initialisation; both
        # strategies share the same seed i.
        km_random = KMeans(n_clusters=k, init='random', n_init=1, random_state=i)
        km_kmeans = KMeans(n_clusters=k, init='k-means++', n_init=1, random_state=i)

        km_random.fit(X)
        km_kmeans.fit(X)

        random_inertias[-1].append(km_random.inertia_)
        kmeans_inertias[-1].append(km_kmeans.inertia_)
# Summarise the higher-dimensional inertia experiment: rows are the cluster
# counts, columns the two initialisation strategies. axis=1 aggregates over
# the runs collected for each cluster count.
cluster_counts = range(2, 7)  # must match the k values used to fill the lists
random_arr = np.array(random_inertias)
kmeans_arr = np.array(kmeans_inertias)

# Mean inertia per cluster count.
df = pd.DataFrame({'random': random_arr.mean(axis=1),
                   'kmeans++': kmeans_arr.mean(axis=1)},
                  index=cluster_counts)
# Standard deviation across runs, drawn as error bars.
yerr = pd.DataFrame({'random': random_arr.std(axis=1),
                     'kmeans++': kmeans_arr.std(axis=1)},
                    index=cluster_counts)
ax = df.plot(kind='bar', yerr=yerr, figsize=(10, 6), rot=0)