import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# Number of clusters in K-means
# Create a synthetic 2-D dataset with K_dataset clusters.
# K_dataset is the number of blob centers passed to make_blobs.
K_dataset = 5

# X holds the sample coordinates, y the index of the blob each sample was
# drawn from; random_state is fixed so the dataset is reproducible.
X, y = make_blobs(n_samples=1000, centers=K_dataset, n_features=2, random_state=0)

# Scatter the two features against each other, colored by generating blob.
plt.scatter(X[:, 0], X[:, 1], c=y, s=10, cmap='viridis')
<matplotlib.collections.PathCollection at 0x7f8e151bfbb0>
# Fit k-means with different number of clusters
Ks = range(1, 10)

# Per-K results, each keyed by the number of clusters K.
within_cluster_sums = {}  # K -> kmeans.inertia_ of the fitted model
assignments = {}          # K -> kmeans.labels_ (cluster label per sample)
centroids = {}            # K -> kmeans.cluster_centers_

for K in Ks:
    # random_state is fixed so every fit is reproducible.
    kmeans = KMeans(n_clusters=K, random_state=0)
    kmeans.fit(X)
    within_cluster_sums[K] = kmeans.inertia_
    assignments[K] = kmeans.labels_
    centroids[K] = kmeans.cluster_centers_
# Plot the results: within-cluster sum of squares as a function of K.
# Wrapping the dict in a Series puts K on the x-axis (the index) and the
# stored sums on the y-axis.
pd.Series(within_cluster_sums).plot(marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Within-cluster sum of squares')
Text(0, 0.5, 'Within-cluster sum of squares')
# Plot assignments: one panel per value of K on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(12, 12))

# axes.ravel() flattens the grid so it can be paired with Ks; zip stops at
# whichever of the two runs out first.
for K, ax in zip(Ks, axes.ravel()):
    # Samples colored by the cluster label k-means assigned for this K.
    ax.scatter(X[:, 0], X[:, 1], c=assignments[K], s=10, cmap='viridis')
    ax.set_title(f'K={K}')
    # Mark centroids
    ax.scatter(centroids[K][:, 0], centroids[K][:, 1], c='red', s=100, alpha=0.5)