import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Retina
%config InlineBackend.figure_format = 'retina'
KNN variants (tutorial)
ML
= np.array([[1, 9], [2, 3], [4, 1], [3, 7], [5, 4], [6, 8], [7, 2], [8, 8], [7, 9], [9, 6]])
X
= np.array([7, 4])
query_pt
def plot_dataset():
0], X[:, 1], s=100)
plt.scatter(X[:, 'X1')
plt.xlabel('X2')
plt.ylabel('equal', adjustable='box')
plt.gca().set_aspect(True, which='both', axis='both', linestyle='--', linewidth=1, alpha=0.5)
plt.grid(min(X[:, 0]), max(X[:, 0])+1, 1))
plt.xticks(np.arange(min(X[:, 1]), max(X[:, 1])+1, 1))
plt.yticks(np.arange(
0], query_pt[1], color='red', s=100)
plt.scatter(query_pt[
plot_dataset()
# Exact 1NN from sklearn
from sklearn.neighbors import NearestNeighbors
= 2
k = NearestNeighbors(n_neighbors=k, algorithm='brute').fit(X)
nbrs = nbrs.kneighbors([query_pt]) distances, indices
X[indices], distances
(array([[[5, 4],
[7, 2]]]),
array([[2., 2.]]))
def pairwise_dist_naive(x: np.ndarray, y: np.ndarray) -> float:
"""
x: numpy array of shape (d,)
y: numpy array of shape (d,)
Returns the Euclidean distance between x and y
"""
= len(x)
d assert d == len(y)
= 0.0
sqrd_distance for i in range(d):
+= (x[i] - y[i])**2
sqrd_distance return np.sqrt(sqrd_distance)
def pairwise_dist_numpy(x: np.ndarray, y: np.ndarray) -> float:
"""
x: numpy array of shape (d,)
y: numpy array of shape (d,)
Returns the Euclidean distance between x and y
"""
return np.sqrt(np.sum((x - y)**2))
def pairwise_dist_numpy_norm(x: np.ndarray, y: np.ndarray) -> float:
"""
x: numpy array of shape (d,)
y: numpy array of shape (d,)
Returns the Euclidean distance between x and y
"""
return np.linalg.norm(x - y)
0], X[1]), pairwise_dist_numpy(X[0], X[1]), pairwise_dist_numpy_norm(X[0], X[1]) pairwise_dist_naive(X[
(6.082762530298219, 6.082762530298219, 6.082762530298219)
def distance_vector(X: np.ndarray, query_pt: np.ndarray) -> np.ndarray:
"""
X: numpy array of shape (n, d)
query_pt: numpy array of shape (d,)
Returns the Euclidean distance between query_pt and each point in X
"""
= X.shape
n, d = np.zeros(n)
distances """Write logic here"""
return distances
# Test that the function is correct by comparing to sklearn
# Find all distances from query_pt to all points in X using sklearn
= NearestNeighbors(n_neighbors=len(X), algorithm='brute').fit(X)
nbrs = nbrs.kneighbors([query_pt])
distances_sklearn, idxs_sklearn print(distances_sklearn)
[[2. 2. 2.82842712 4.12310563 4.12310563 4.24264069
5. 5. 5.09901951 7.81024968]]
0, idxs_sklearn[0]] distances_sklearn[
array([4.12310563, 5. , 7.81024968, 4.24264069, 5. ,
2.82842712, 4.12310563, 5.09901951, 2. , 2. ])
import pandas as pd
= pd.DataFrame(X, columns=['X1', 'X2'])
df "query_distance"] = distances_sklearn[0, idxs_sklearn[0]]
df[ df
X1 | X2 | query_distance | |
---|---|---|---|
0 | 1 | 9 | 4.123106 |
1 | 2 | 3 | 5.000000 |
2 | 4 | 1 | 7.810250 |
3 | 3 | 7 | 4.242641 |
4 | 5 | 4 | 5.000000 |
5 | 6 | 8 | 2.828427 |
6 | 7 | 2 | 4.123106 |
7 | 8 | 8 | 5.099020 |
8 | 7 | 9 | 2.000000 |
9 | 9 | 6 | 2.000000 |
### LSH with Random Projections
### Random Projections
= 3
P 35)
np.random.seed(= np.random.randn(X.shape[1] + 1, P) # why +1? R
R
array([[-1.88973671, -0.41359218, -0.76602601],
[-0.92412667, -1.42159783, 0.80525599],
[ 1.14886176, 1.1694284 , -0.80200928]])
# For now, make R[:, 2] =1 to make it easier to plot
2] = 1
R[:, R
array([[-1.88973671, -0.41359218, 1. ],
[-0.92412667, -1.42159783, 1. ],
[ 1.14886176, 1.1694284 , 1. ]])
plot_dataset()
# Plot hyperplanes
for i in range(P):
= np.array([min(X[:, 0]), max(X[:, 0])])
x1 = (-R[0, i] - R[1, i]*x1) / R[2, i]
x2 =f'Hyperplane {i+1}')
plt.plot(x1, x2, label plt.legend()
= np.hstack([ np.ones((X.shape[0], 1)), X])
X_aug X_aug
array([[1., 1., 9.],
[1., 2., 3.],
[1., 4., 1.],
[1., 3., 7.],
[1., 5., 4.],
[1., 6., 8.],
[1., 7., 2.],
[1., 8., 8.],
[1., 7., 9.],
[1., 9., 6.]])
@ R X_aug
array([[ 7.52589247, 8.68966556, 11. ],
[-0.29140476, 0.25149734, 6. ],
[-4.43738162, -4.93055512, 6. ],
[ 3.37991562, 3.5076131 , 11. ],
[-1.914923 , -2.84386777, 10. ],
[ 1.75639738, 0.41224799, 15. ],
[-6.06089985, -8.02592023, 10. ],
[-0.09185596, -2.43094768, 17. ],
[ 1.98113247, 0.16007855, 17. ],
[-3.31370614, -6.19140231, 16. ]])
@ R) np.sign(X_aug
array([[ 1., 1., 1.],
[-1., 1., 1.],
[-1., -1., 1.],
[ 1., 1., 1.],
[-1., -1., 1.],
[ 1., 1., 1.],
[-1., -1., 1.],
[-1., -1., 1.],
[ 1., 1., 1.],
[-1., -1., 1.]])