import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
Hierarchical Clustering
Hierarchical Clustering
# Create a small toy dataset with K_dataset clusters.
# make_blobs returns the sample coordinates (X) and the generating
# cluster label of every sample (y); random_state pins the draw.
K_dataset = 3
X, y = make_blobs(n_samples=10, centers=K_dataset, n_features=2, random_state=0)

# Scatter the samples, coloured by their generating cluster.
plt.scatter(X[:, 0], X[:, 1], c=y, s=200, cmap='viridis')

# Annotate every sample with its row number so it can be matched against
# the dendrogram leaves and the distance-matrix labels used further down.
for i, (x_coord, y_coord) in enumerate(X):
    plt.annotate(str(i), (x_coord, y_coord), fontsize=20)
# Show after 1 step of hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the linkage matrix for X using the 'average' linkage method.
Z = linkage(X, 'average')

# Draw the dendrogram; leaves are labelled with the sample row numbers
# (the same numbers annotated on the scatter plot).
plt.figure(figsize=(10, 5))
dendrogram(Z, labels=range(X.shape[0]), leaf_rotation=90, leaf_font_size=20)
plt.show()
# Pairwise distance matrix
from scipy.spatial.distance import pdist, squareform

# Compute the distance matrix: pdist returns the condensed (1-D) vector of
# pairwise Euclidean distances, squareform expands it into a square matrix.
dist = pdist(X, metric='euclidean')
dist = squareform(dist)

# Label every row/column "P<row number>" and wrap the matrix in a DataFrame
# so it renders as a readable table.
X_index = [f"P{x}" for x in np.arange(X.shape[0])]
df = pd.DataFrame(dist, index=X_index, columns=X_index)
df
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 |
---|---|---|---|---|---|---|---|---|---|---|
P0 | 0.000000 | 1.468503 | 5.849187 | 1.072565 | 4.001437 | 4.311101 | 5.641206 | 5.617833 | 4.732056 | 1.796596 |
P1 | 1.468503 | 0.000000 | 4.427098 | 0.911272 | 3.289419 | 3.124431 | 5.879548 | 4.149512 | 3.283718 | 0.332095 |
P2 | 5.849187 | 4.427098 | 0.000000 | 4.904326 | 3.705648 | 2.158345 | 7.350236 | 1.347556 | 1.194950 | 4.132565 |
P3 | 1.072565 | 0.911272 | 4.904326 | 0.000000 | 2.966927 | 3.253476 | 5.083095 | 4.830950 | 3.843925 | 1.193839 |
P4 | 4.001437 | 3.289419 | 3.705648 | 2.966927 | 0.000000 | 1.575484 | 3.691478 | 4.465344 | 3.299630 | 3.257210 |
P5 | 4.311101 | 3.124431 | 2.158345 | 3.253476 | 1.575484 | 0.000000 | 5.263313 | 2.910477 | 1.771561 | 2.937847 |
P6 | 5.641206 | 5.879548 | 7.350236 | 5.083095 | 3.691478 | 5.263313 | 0.000000 | 8.154381 | 6.982831 | 6.034281 |
P7 | 5.617833 | 4.149512 | 1.347556 | 4.830950 | 4.465344 | 2.910477 | 8.154381 | 0.000000 | 1.180379 | 3.821671 |
P8 | 4.732056 | 3.283718 | 1.194950 | 3.843925 | 3.299630 | 1.771561 | 6.982831 | 1.180379 | 0.000000 | 2.976718 |
P9 | 1.796596 | 0.332095 | 4.132565 | 1.193839 | 3.257210 | 2.937847 | 6.034281 | 3.821671 | 2.976718 | 0.000000 |
import seaborn as sns
# Use only the lower triangle of the matrix: mark the upper triangle
# (diagonal included) as True in the mask that is handed to the heatmap.
mask = np.zeros_like(dist, dtype=np.bool_)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(10, 10))
# Make heatmap with seaborn and mask, mentioning column and row names
sns.heatmap(dist, mask=mask, xticklabels=X_index, yticklabels=X_index, annot=True, cmap='viridis')
<AxesSubplot:>
# Combine closest clusters
# (i.e. the two clusters with the smallest distance)
# Find the two clusters with the smallest distance ignoring the diagonal
# (i.e. the distance between a cluster and itself)

# NOTE: fill_diagonal modifies `dist` in place; the diagonal is set to
# +inf so that argmin cannot pick a point paired with itself.
np.fill_diagonal(dist, np.inf)

# argmin works on the flattened matrix; unravel_index converts that flat
# position back into a (row, column) pair.
i, j = np.unravel_index(np.argmin(dist), dist.shape)
print(i, j)
1 9
# Combine the two clusters to a single "point": the merged point is the
# coordinate-wise mean of rows i and j of X.
X_new = np.vstack([X[i], X[j]])
new_point_name = f'{i} & {j}'
new_point_value = X_new.mean(axis=0)
print(new_point_name, new_point_value)

# Add new point to the dataset, remove the two old points.
# The merged point is appended first, so rows i and j still refer to the
# original samples when they are deleted.
X = np.vstack([X, new_point_value])
X = np.delete(X, [i, j], axis=0)
print(X.shape)

# Update X_index to reflect the new point and the removed points.
# Filter the existing labels by position instead of rebuilding them from a
# hard-coded sample count, so the labels stay aligned with the rows of X.
X_index = [label for pos, label in enumerate(X_index) if pos not in (i, j)]
X_index.append(new_point_name)

# Compute the distance matrix for the reduced set of points
dist = pdist(X, metric='euclidean')
dist = squareform(dist)
df = pd.DataFrame(dist, index=X_index, columns=X_index)
1 & 9 [1.83183315 4.28894623]
(9, 2)
# Display the current pairwise-distance table
df
| | P0 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | 1 & 9 |
---|---|---|---|---|---|---|---|---|---|
P0 | 0.000000 | 5.849187 | 1.072565 | 4.001437 | 4.311101 | 5.641206 | 5.617833 | 4.732056 | 1.632347 |
P2 | 5.849187 | 0.000000 | 4.904326 | 3.705648 | 2.158345 | 7.350236 | 1.347556 | 1.194950 | 4.279144 |
P3 | 1.072565 | 4.904326 | 0.000000 | 2.966927 | 3.253476 | 5.083095 | 4.830950 | 3.843925 | 1.048934 |
P4 | 4.001437 | 3.705648 | 2.966927 | 0.000000 | 1.575484 | 3.691478 | 4.465344 | 3.299630 | 3.269140 |
P5 | 4.311101 | 2.158345 | 3.253476 | 1.575484 | 0.000000 | 5.263313 | 2.910477 | 1.771561 | 3.028025 |
P6 | 5.641206 | 7.350236 | 5.083095 | 3.691478 | 5.263313 | 0.000000 | 8.154381 | 6.982831 | 5.955102 |
P7 | 5.617833 | 1.347556 | 4.830950 | 4.465344 | 2.910477 | 8.154381 | 0.000000 | 1.180379 | 3.985504 |
P8 | 4.732056 | 1.194950 | 3.843925 | 3.299630 | 1.771561 | 6.982831 | 1.180379 | 0.000000 | 3.129577 |
1 & 9 | 1.632347 | 4.279144 | 1.048934 | 3.269140 | 3.028025 | 5.955102 | 3.985504 | 3.129577 | 0.000000 |
# Use only the lower triangle of the matrix: mark the upper triangle
# (diagonal included) as True in the mask that is handed to the heatmap.
mask = np.zeros_like(dist, dtype=np.bool_)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(10, 10))
# Make heatmap with seaborn and mask, mentioning column and row names
sns.heatmap(dist, mask=mask, xticklabels=X_index, yticklabels=X_index, annot=True, cmap='viridis')
<AxesSubplot:>