import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'KNN LSH
    KNN LSH
  
# Generate some data
X = np.random.randn(3000, 2)
# Plot the data
plt.scatter(X[:, 0], X[:, 1], s=30)<matplotlib.collections.PathCollection at 0x7f321da32c10>
# Naive KNN
def naive_knn_for_loop(X, x_test, k=3):
    dists = np.zeros(X.shape[0])
    for i in range(X.shape[0]): # N iterations (N = number of data points)
        dists[i] = np.dot(X[i] - x_test, X[i] - x_test) # Time complexity: O(D)
    
    # Time complexity to create the distance array: O(N*D)
    # Now, we need to find the k smallest distances
    return np.argpartition(dists, k)[:k] # Time complexity: O(Nk) or O(N) depending on the implementation
  naive_knn_for_loop(X, np.array([0, 0]))array([2529,  958,  804])X[naive_knn_for_loop(X, np.array([0, 0]))]array([[-0.02103967,  0.02703294],
       [ 0.0092843 ,  0.02548091],
       [-0.03094897,  0.01750535]])%timeit naive_knn_for_loop(X, np.array([0, 0]))12.3 ms ± 47.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)# Implement using numpy
def naive_knn_numpy(X, x_test, k=3):
    dists = np.sum((X - x_test)**2, axis=1)
    #return np.partition(dists, k)[:k]
    sorted_dists = np.argsort(dists)
    return sorted_dists[:k]naive_knn_numpy(X, np.array([0, 0]))array([ 958, 2529,  804])%timeit naive_knn_numpy(X, np.array([0, 0]))240 µs ± 631 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)# Implement using numpy
def naive_knn_numpy(X, x_test, k=3):
    dists = np.sum((X - x_test)**2, axis=1)
    return np.argpartition(dists, k)[:k]
    #sorted_dists = np.argsort(dists)
    #return sorted_dists[:k]%timeit naive_knn_numpy(X, np.array([0, 0]))84.6 µs ± 607 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)# Show LSH implementation step by step
# Creating a random separating hyperplane
w = np.random.randn(2)
b = np.random.randn(1)/4.0
# Plot the separating hyperplane
x = np.linspace(-3, 3, 100)
y = -(w[0] * x + b) / w[1]
plt.scatter(X[:, 0], X[:, 1], s=30)
plt.plot(x, y, 'r', linewidth=3)
# Color the points based on which side of the hyperplane they are on
colors = X[:, 0]*w[0] + X[:, 1]*w[1] + b > 0
plt.scatter(X[:, 0], X[:, 1], s=30, c=colors)<matplotlib.collections.PathCollection at 0x7f31106b4ac0>
# Create three random hyperplanes and color the points based on which side of the hyperplane they are on. 
# there should be 2^3 = 8 different colors
# each separating hyperplane corresponds to a bit in the hash
hash = np.zeros((X.shape[0], 3)).astype(int)
ws = []
bs = []
# Cost for creating the hash table: O(N*H*D) 
for H in range(3): # H = number of hyperplanes
    w = np.random.randn(2)
    b = np.random.randn(1)/4.0
    ws.append(w)
    bs.append(b)
    hash[:, H] = X[:, 0]*w[0] + X[:, 1]*w[1] + b > 0 # D computations per iteration
# Convert the hash to a decimal number
hash_dec = np.sum(hash * 2**np.arange(3)[::-1], axis=1)
# Plot the hash
plt.scatter(X[:, 0], X[:, 1], s=30, c=hash)
# Plot the hash with the separating hyperplanes
plt.scatter(X[:, 0], X[:, 1], s=30, c=hash_dec)
for w, b in zip(ws, bs):
    print(w, b)
    x = np.linspace(-3, 3, 100)
    y = -(w[0] * x + b) / w[1]
    plt.plot(x, y, 'r')
# Mark the test point
x_test = np.array([0, 0])
plt.scatter(x_test[0], x_test[1], s=100, c='r')[-1.78798897 -1.3408181 ] [-0.08094113]
[ 0.9447324  -2.47059549] [0.09350769]
[0.20531227 0.97521902] [-0.22471283]<matplotlib.collections.PathCollection at 0x7f31105bf280>
df = pd.DataFrame(hash)
df.columns = ['h1', 'h2', 'h3']
df['hash_dec'] = hash_dec
df['x'] = X[:, 0]
df['y'] = X[:, 1]
df.head(10)| h1 | h2 | h3 | hash_dec | x | y | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 6 | -1.289013 | -0.497073 | 
| 1 | 1 | 1 | 0 | 6 | 0.721631 | -1.923390 | 
| 2 | 1 | 1 | 0 | 6 | 0.042595 | -0.177549 | 
| 3 | 1 | 1 | 0 | 6 | 0.148706 | -0.452442 | 
| 4 | 1 | 1 | 0 | 6 | -0.047372 | -0.431685 | 
| 5 | 1 | 1 | 0 | 6 | -0.478764 | -0.304759 | 
| 6 | 0 | 1 | 0 | 2 | 0.812057 | -0.574337 | 
| 7 | 1 | 0 | 1 | 5 | -1.493164 | 1.209339 | 
| 8 | 0 | 1 | 0 | 2 | 0.820065 | -0.575965 | 
| 9 | 0 | 0 | 1 | 1 | 1.045276 | 1.143788 | 
pd.DataFrame(hash).value_counts()0  1  2
1  1  0    846
0  0  1    827
   1  0    491
1  0  0    346
      1    243
0  1  1    210
   0  0     37
dtype: int64# Predict the K nearest neighbors using LSH
# Compute the hash for the test point
x_test = np.array([0, 0])
hash_test = x_test[0]*ws[0][0] + x_test[1]*ws[0][1] + bs[0] > 0
#convert to decimal
hash_test_dec = np.sum(hash_test * 2**np.arange(3)[::-1])
hash_test_dec0# Find subset of points with the same hash
X_subset = X[hash_dec == hash_test_dec]
X_subset.shape(37, 2)# Now, we can use the naive KNN implementation to find the K nearest neighbors
ix = naive_knn_numpy(X_subset, x_test, k=3)
X_subset[ix]array([[-0.04090763,  0.07013394],
       [-0.00419256,  0.08614131],
       [-0.05284791,  0.06786371]])%timeit naive_knn_numpy(X_subset, x_test, k=3)10.5 µs ± 31.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)# Using FAISS from Facebook
import faiss
# Create an index
index = faiss.IndexFlatL2(2)   # build the index
# Add the data to the index
index.add(X.astype(np.float32))                  # add vectors to the index# Search for the K nearest neighbors
D, I = index.search(x_test.astype(np.float32).reshape(1, -1), k=3)     # actual searchDarray([[0.00073547, 0.00117345, 0.00126428]], dtype=float32)Iarray([[ 958, 2529,  804]])X[I[0]]array([[ 0.0092843 ,  0.02548091],
       [-0.02103967,  0.02703294],
       [-0.03094897,  0.01750535]])%timeit index.search(x_test.astype(np.float32).reshape(1, -1), k=3)50.9 µs ± 212 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)# Now, run on GPU
res = faiss.StandardGpuResources()  # use a single GPU
# Create an index
index = faiss.IndexFlatL2(2)   # build the index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
# Add the data to the index
gpu_index_flat.add(X.astype(np.float32))                  # add vectors to the index%timeit gpu_index_flat.search(x_test.astype(np.float32).reshape(1, -1), k=3)79.8 µs ± 674 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)# The above is slow because
# 1. We are copying the data to the GPU
# 2. We are copying the data back to the CPU
# 3. Not enough data and low dimensional data