KNN variants (tutorial)

Author

Nipun Batra

Published

April 5, 2024

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Retina 
%config InlineBackend.figure_format = 'retina'

X = np.array([[1, 9], [2, 3], [4, 1], [3, 7], [5, 4], [6, 8], [7, 2], [8, 8], [7, 9], [9, 6]])

query_pt = np.array([7, 4])

def plot_dataset():
    plt.scatter(X[:, 0], X[:, 1], s=100)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.gca().set_aspect('equal', adjustable='box')
    plt.grid(True, which='both', axis='both', linestyle='--', linewidth=1, alpha=0.5)
    plt.xticks(np.arange(min(X[:, 0]), max(X[:, 0])+1, 1))
    plt.yticks(np.arange(min(X[:, 1]), max(X[:, 1])+1, 1))
    
    plt.scatter(query_pt[0], query_pt[1], color='red', s=100)
    

plot_dataset()

# Exact 1NN from sklearn
from sklearn.neighbors import NearestNeighbors

k = 2
nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute').fit(X)
distances, indices = nbrs.kneighbors([query_pt])


X[indices], distances

(array([[[5, 4],
         [7, 2]]]),
 array([[2., 2.]]))

def pairwise_dist_naive(x: np.ndarray, y: np.ndarray) -> float:
    """
    x: numpy array of shape (d,)
    y: numpy array of shape (d,)
    
    Returns the Euclidean distance between x and y
    """
    d = len(x)
    assert d == len(y)
    sqrd_distance = 0.0
    for i in range(d):
        sqrd_distance += (x[i] - y[i])**2
    return np.sqrt(sqrd_distance)

def pairwise_dist_numpy(x: np.ndarray, y: np.ndarray) -> float:
    """
    x: numpy array of shape (d,)
    y: numpy array of shape (d,)
    
    Returns the Euclidean distance between x and y
    """
    return np.sqrt(np.sum((x - y)**2))

def pairwise_dist_numpy_norm(x: np.ndarray, y: np.ndarray) -> float:
    """
    x: numpy array of shape (d,)
    y: numpy array of shape (d,)
    
    Returns the Euclidean distance between x and y
    """
    return np.linalg.norm(x - y)

pairwise_dist_naive(X[0], X[1]), pairwise_dist_numpy(X[0], X[1]), pairwise_dist_numpy_norm(X[0], X[1])

(6.082762530298219, 6.082762530298219, 6.082762530298219)

def distance_vector(X: np.ndarray, query_pt: np.ndarray) -> np.ndarray:
    """
    X: numpy array of shape (n, d)
    query_pt: numpy array of shape (d,)
    
    Returns the Euclidean distance between query_pt and each point in X
    """
    n, d = X.shape
    distances = np.zeros(n)
    """Write logic here"""
    
    return distances

# Test that the function is correct by comparing to sklearn

# Find all distances from query_pt to all points in X using sklearn
nbrs = NearestNeighbors(n_neighbors=len(X), algorithm='brute').fit(X)
distances_sklearn, idxs_sklearn = nbrs.kneighbors([query_pt])
print(distances_sklearn)

[[2.         2.         2.82842712 4.12310563 4.12310563 4.24264069
  5.         5.         5.09901951 7.81024968]]

distances_sklearn[0, idxs_sklearn[0]]

array([4.12310563, 5.        , 7.81024968, 4.24264069, 5.        ,
       2.82842712, 4.12310563, 5.09901951, 2.        , 2.        ])

import pandas as pd

df = pd.DataFrame(X, columns=['X1', 'X2'])
df["query_distance"] = distances_sklearn[0, idxs_sklearn[0]]
df

	X1	X2	query_distance
0	1	9	4.123106
1	2	3	5.000000
2	4	1	7.810250
3	3	7	4.242641
4	5	4	5.000000
5	6	8	2.828427
6	7	2	4.123106
7	8	8	5.099020
8	7	9	2.000000
9	9	6	2.000000

### LSH with Random Projections

### Random Projections

P = 3
np.random.seed(35)
R = np.random.randn(X.shape[1] + 1, P)  # why +1?

array([[-1.88973671, -0.41359218, -0.76602601],
       [-0.92412667, -1.42159783,  0.80525599],
       [ 1.14886176,  1.1694284 , -0.80200928]])

# For now, make R[:, 2] =1 to make it easier to plot
R[:, 2] = 1
R

array([[-1.88973671, -0.41359218,  1.        ],
       [-0.92412667, -1.42159783,  1.        ],
       [ 1.14886176,  1.1694284 ,  1.        ]])

plot_dataset()

# Plot hyperplanes
for i in range(P):
    x1 = np.array([min(X[:, 0]), max(X[:, 0])])
    x2 = (-R[0, i] - R[1, i]*x1) / R[2, i]
    plt.plot(x1, x2, label=f'Hyperplane {i+1}')
plt.legend()

X_aug = np.hstack([ np.ones((X.shape[0], 1)), X])
X_aug

array([[1., 1., 9.],
       [1., 2., 3.],
       [1., 4., 1.],
       [1., 3., 7.],
       [1., 5., 4.],
       [1., 6., 8.],
       [1., 7., 2.],
       [1., 8., 8.],
       [1., 7., 9.],
       [1., 9., 6.]])

X_aug @ R

array([[ 7.52589247,  8.68966556, 11.        ],
       [-0.29140476,  0.25149734,  6.        ],
       [-4.43738162, -4.93055512,  6.        ],
       [ 3.37991562,  3.5076131 , 11.        ],
       [-1.914923  , -2.84386777, 10.        ],
       [ 1.75639738,  0.41224799, 15.        ],
       [-6.06089985, -8.02592023, 10.        ],
       [-0.09185596, -2.43094768, 17.        ],
       [ 1.98113247,  0.16007855, 17.        ],
       [-3.31370614, -6.19140231, 16.        ]])

np.sign(X_aug @ R)

array([[ 1.,  1.,  1.],
       [-1.,  1.,  1.],
       [-1., -1.,  1.],
       [ 1.,  1.,  1.],
       [-1., -1.,  1.],
       [ 1.,  1.,  1.],
       [-1., -1.,  1.],
       [-1., -1.,  1.],
       [ 1.,  1.,  1.],
       [-1., -1.,  1.]])