Principal Component Analysis

ML

Author: Nipun Batra

Published: April 15, 2025
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
import torch 
import torch.nn as nn

import pandas as pd
# Retina mode
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.datasets import make_blobs
from torchvision import datasets, transforms
1.26.4
# Simple 2D Gaussian data with 0.7 correlation between the two features
X = torch.distributions.multivariate_normal.MultivariateNormal(
    torch.tensor([5.0, -2.0]), torch.tensor([[1.0, 0.7], [0.7, 1.0]])
).sample((1000,))
### Plot the data
plt.figure(figsize=(4, 4))
plt.scatter(X[:, 0], X[:, 1],  alpha=0.5)

Center the data

X_mean = X.mean(0, keepdim=True)
X_centered = X - X_mean

plt.scatter(X_centered[:, 0], X_centered[:, 1], alpha=0.5, label='Centered')
plt.scatter(X[:, 0], X[:, 1], alpha=0.5, label='Original')
plt.scatter(X_mean[:, 0], X_mean[:, 1], color='red', label='Original Mean')
centered_mean = X_centered.mean(0, keepdim=True)
plt.scatter(centered_mean[:, 0], centered_mean[:, 1], color='blue', label='Centered Mean')
plt.title('Centered vs Original')
plt.xlabel('X1')
plt.ylabel('X2')
plt.legend()

Finding covariance
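With the data centered, the sample covariance matrix is $\Sigma = \frac{1}{n-1} X_c^\top X_c$, where $X_c$ is the centered data matrix with one sample per row; the next cell computes exactly this.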

cov = X_centered.T @ X_centered / (X.shape[0] - 1)
print('Covariance matrix:')
print(cov)
Covariance matrix:
tensor([[1.0229, 0.7291],
        [0.7291, 1.0733]])
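As a quick sanity check (a minimal sketch using the tensors defined above), torch.cov on the transposed data should agree with the manual formula:

# torch.cov expects variables along rows, hence the transpose;
# its default correction=1 matches the 1/(n-1) normalization above
cov_builtin = torch.cov(X_centered.T)
print(torch.allclose(cov, cov_builtin, atol=1e-5))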
### Finding the eigenvalues and eigenvectors
eigvals, eigvecs = torch.linalg.eigh(cov)
print('Eigenvalues:')
print(eigvals)
print('Eigenvectors:')
print(eigvecs)
Eigenvalues:
tensor([0.3186, 1.7776])
Eigenvectors:
tensor([[-0.7192,  0.6948],
        [ 0.6948,  0.7192]])
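Note that torch.linalg.eigh returns eigenvalues in ascending order, so the direction of largest variance is the last column of eigvecs. If you prefer the eigenpairs sorted by decreasing variance, a small sketch:

# Reorder eigenpairs from largest to smallest eigenvalue
order = torch.argsort(eigvals, descending=True)
eigvals_desc = eigvals[order]
eigvecs_desc = eigvecs[:, order]
print(eigvals_desc)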
# Plot centered data with eigenvectors
plt.scatter(X_centered[:, 0], X_centered[:, 1], alpha=0.3)

# Plot eigenvectors starting from the origin (the mean of the centered data)
for i in range(2):
    vec = eigvecs[:, i]
    plt.quiver(0, 0, vec[0], vec[1], scale=3, color=f"C{i}", label=f"u{i+1} ({eigvals[i]:.2f})")

plt.axis('equal')
plt.legend()
plt.title("Centered Data with Principal Directions")
plt.show()

top_vec = eigvecs[:, -1]  # Last column of eigvecs (the top eigenvector)
print('Top eigenvector:')
print(top_vec)
Top eigenvector:
tensor([0.6948, 0.7192])
# Project centered data onto the top eigenvector using dot product
X_proj = torch.zeros_like(X_centered)  # Initialize an empty tensor to store projections

# Loop through each data point and project it onto the top eigenvector
for i in range(X_centered.shape[0]):
    # Calculate the projection of the i-th data point onto the top eigenvector
    X_proj[i] = torch.dot(X_centered[i], top_vec) * top_vec  # Scalar projection * eigenvector

# Reconstruct the data by adding the mean back
X_recon = X_proj + X_mean
X_recon
tensor([[ 2.8117, -4.2273],
        [ 4.4202, -2.5621],
        [ 5.5191, -1.4246],
        ...,
        [ 4.1834, -2.8073],
        [ 3.9091, -3.0913],
        [ 5.5841, -1.3573]])
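The loop above is written for clarity; the same rank-1 projection can be computed in one shot with a matrix product and an outer product (a sketch reusing X_centered and top_vec):

# Vectorized equivalent of the loop above
scores = X_centered @ top_vec              # (1000,) scalar projections onto the top eigenvector
X_proj_vec = torch.outer(scores, top_vec)  # (1000, 2) projected points
print(torch.allclose(X_proj, X_proj_vec, atol=1e-5))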
plt.scatter(X[:, 0], X[:, 1], alpha=0.2, label='Original')
plt.scatter(X_recon[:, 0], X_recon[:, 1], alpha=0.8, label='PCA-1D')
plt.axis('equal')
plt.legend()
plt.title("PCA projection onto top component")
Text(0.5, 1.0, 'PCA projection onto top component')
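To quantify what the 1-D approximation loses, we can compare the fraction of variance captured by the top eigenvalue with the mean squared reconstruction error (a sketch; exact numbers depend on the random sample):

# Fraction of total variance captured by the top component
explained = eigvals[-1] / eigvals.sum()
# Mean squared reconstruction error of the rank-1 approximation
mse = ((X_centered - X_proj) ** 2).sum(dim=1).mean()
print(f"Variance explained: {explained:.3f}, reconstruction MSE: {mse:.3f}")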

# Load MNIST dataset
import torchvision
transform = transforms.Compose([transforms.ToTensor(), transforms.Lambda(lambda x: x.view(-1))])
train_data = torchvision.datasets.MNIST(root='~/.data', train=True, download=True, transform=transform)

# Take a small subset of data for simplicity
X = train_data.data[:1000].float()  # Take the first 1000 images (28x28 pixels)
y = train_data.targets[:1000]  # Corresponding labels
# View the first image
plt.imshow(X[0].reshape(28, 28), cmap='gray')

X = X / 255.0  # Normalize to [0, 1]
X_centered = X - X.mean(dim=0)  # Center the data by subtracting the mean
X_vecs = X_centered.reshape(X_centered.shape[0], -1)  # Reshape to (n_samples, n_features)
# Compute covariance matrix
cov_matrix = torch.cov(X_vecs.T)  # torch.cov treats rows as variables, so transpose to (n_features, n_samples)
print('Covariance matrix shape:', cov_matrix.shape)
Covariance matrix shape: torch.Size([784, 784])
plt.imshow(cov_matrix, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.title('Covariance Matrix')
Text(0.5, 1.0, 'Covariance Matrix')

# Eigenvalue decomposition
eigvals, eigvecs = torch.linalg.eigh(cov_matrix)
print('Eigenvalues shape:', eigvals.shape)
Eigenvalues shape: torch.Size([784])
# Top K eigenvalues and eigenvectors
K = 10
top_k_eigvals, top_k_indices = torch.topk(eigvals, K)
top_k_eigvecs = eigvecs[:, top_k_indices]
print('Top K eigenvalues:', top_k_eigvals)
print('Top K eigenvectors shape:', top_k_eigvecs.shape)
# Plot the top K eigenvalues
plt.figure(figsize=(8, 4))
plt.bar(range(K), top_k_eigvals.numpy())
plt.xlabel('Eigenvalue Index')
plt.ylabel('Eigenvalue')
plt.title('Top K Eigenvalues')
plt.show()
Top K eigenvalues: tensor([5.1288, 4.0052, 3.5313, 2.8018, 2.5156, 2.3427, 1.8130, 1.5647, 1.4760,
        1.1167])
Top K eigenvectors shape: torch.Size([784, 10])
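A useful diagnostic is how much of the total variance these 10 components capture (a small sketch using the eigenvalues already computed):

# Fraction of total pixel variance captured by the top K components
explained_ratio = top_k_eigvals.sum() / eigvals.sum()
print(f"Variance explained by top {K} components: {explained_ratio:.3f}")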

# Project data onto the top K eigenvectors
X_proj = torch.matmul(X_vecs, top_k_eigvecs)
print('Projected data shape:', X_proj.shape)
Projected data shape: torch.Size([1000, 10])

Reconstruct the data from the top K components
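The reconstruction step itself is not shown above; a minimal sketch, reusing X_proj, top_k_eigvecs, and the per-pixel mean defined earlier, could look like this:

# Map the 10-D scores back to 784-D pixel space and add the per-pixel mean back
X_mean_vec = X.reshape(X.shape[0], -1).mean(dim=0)   # (784,) per-pixel mean
X_recon = X_proj @ top_k_eigvecs.T + X_mean_vec      # (1000, 784)

# Compare an original digit with its K-component reconstruction
fig, axes = plt.subplots(1, 2, figsize=(6, 3))
axes[0].imshow(X[0].reshape(28, 28), cmap='gray')
axes[0].set_title('Original')
axes[1].imshow(X_recon[0].reshape(28, 28), cmap='gray')
axes[1].set_title(f'Reconstruction (K={K})')
plt.show()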
import matplotlib.cm as cm

plt.figure(figsize=(5, 5))

colors = cm.tab10(np.arange(10))  # 10 distinct colors
for i in range(10):
    idx = (y == i)
    plt.scatter(X_proj[idx, 0], X_proj[idx, 1], 
                alpha=0.6, color=colors[i], label=f"{i}", s=10)

plt.title('PCA Projection onto Top 2 Components (MNIST)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.axis('equal')
plt.legend(title='Digit', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
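As an optional cross-check (a sketch, assuming scikit-learn is installed), sklearn.decomposition.PCA fit on the same subset should recover the same subspace; individual components may differ by a sign flip, so we compare the absolute correlation of the first-component scores:

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_proj_sklearn = pca.fit_transform(X_vecs.numpy())

# Scores can be sign-flipped relative to our eigenvectors, so compare |correlation|
corr = np.corrcoef(X_proj[:, 0].numpy(), X_proj_sklearn[:, 0])[0, 1]
print(f"|correlation| between PC1 scores: {abs(corr):.3f}")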