import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Curse of Dimensionality
Curse of Dimensionality
#
= 10
n 42)
np.random.seed(# Get `n` points in 1d space uniformly distributed from 0 to 1
= np.random.uniform(0, 1, n)
x ='r', s=100) plt.scatter(x, np.zeros(n), c
<matplotlib.collections.PathCollection at 0x7f55f7e71d60>
# Pick a random test point
= np.random.uniform(0, 1, 1)
x_test
# Mark the nearest point and farthest point
= x[np.argmin(np.abs(x - x_test))]
x_nearest = x[np.argmax(np.abs(x - x_test))]
x_farthest
='r', s=100)
plt.scatter(x, np.zeros(n), c0, c='b', s=100, marker='*', label='test')
plt.scatter(x_test, 0, c='g', s=100, label='nearest')
plt.scatter(x_nearest, 0, c='y', s=100, label='farthest')
plt.scatter(x_farthest,
plt.legend()
= np.abs(x_test - x_nearest) / np.abs(x_test - x_farthest)
ratio print('Ratio of distances: {}'.format(ratio))
Ratio of distances: [0.040316]
# Do the above experiment for 1000 times
= 10
n 42)
np.random.seed(
= 1000
n_exp = np.zeros(n_exp)
ratios for i in range(n_exp):
= np.random.uniform(0, 1, n)
x = np.random.uniform(0, 1, 1)
x_test = x[np.argmin(np.abs(x - x_test))]
x_nearest = x[np.argmax(np.abs(x - x_test))]
x_farthest = np.abs(x_test - x_nearest) / np.abs(x_test - x_farthest) ratios[i]
import seaborn as sns
=False, bins=20) sns.displot(ratios, kde
# Repeat the experiment in 2d
= 10
n 42)
np.random.seed(= np.random.uniform(0, 1, (n, 2))
x 0], x[:, 1], c='r', s=100) plt.scatter(x[:,
<matplotlib.collections.PathCollection at 0x7f53d91c9a60>
# Pick a random test point
= np.random.uniform(0, 1, 2)
x_test
# Mark the nearest point and farthest point
= x[np.argmin(np.linalg.norm(x - x_test, axis=1))]
x_nearest = x[np.argmax(np.linalg.norm(x - x_test, axis=1))]
x_farthest
0], x[:, 1], c='r', s=100)
plt.scatter(x[:, 0], x_test[1], c='b', s=100, marker='*', label='test')
plt.scatter(x_test[0], x_nearest[1], c='g', s=100, label='nearest')
plt.scatter(x_nearest[0], x_farthest[1], c='y', s=100, label='farthest')
plt.scatter(x_farthest[ plt.legend()
<matplotlib.legend.Legend at 0x7f53d91456a0>
# Find the ratio of distances between the nearest and farthest points in 1000 experiments
= 10
n 42)
np.random.seed(
= np.zeros(n_exp)
ratios_2d for i in range(n_exp):
= np.random.uniform(0, 1, (n, 2))
x = np.random.uniform(0, 1, 2)
x_test = x[np.argmin(np.linalg.norm(x - x_test, axis=1))]
x_nearest = x[np.argmax(np.linalg.norm(x - x_test, axis=1))]
x_farthest = np.linalg.norm(x_test - x_nearest) / np.linalg.norm(x_test - x_farthest) ratios_2d[i]
=False, bins=20) sns.displot(ratios_2d, kde
# Now, let's do the same experiment in dimensions varying from 1 to 20
= 10
n 42)
np.random.seed(= 40
n_dim = np.zeros((n_exp, n_dim))
ratios_nd for i in range(n_exp):
for d in range(1, n_dim + 1):
= np.random.uniform(0, 1, (n, d))
x = np.random.uniform(0, 1, d)
x_test = x[np.argmin(np.linalg.norm(x - x_test, axis=1))]
x_nearest = x[np.argmax(np.linalg.norm(x - x_test, axis=1))]
x_farthest - 1] = np.linalg.norm(x_test - x_nearest) / np.linalg.norm(x_test - x_farthest) ratios_nd[i, d
# Plot the ratio of distances between the nearest and farthest points in 1000 experiments for each dimension
=(10, 6))
plt.figure(figsize1, n_dim + 1), np.mean(ratios_nd, axis=0), 'o-')
plt.plot(np.arange('Dimension')
plt.xlabel('Ratio of distances')
plt.ylabel('Ratio of distances between the nearest and farthest points in 1000 experiments for each dimension')
plt.title(0, 1.) plt.ylim(
(0.0, 1.0)
# Let us now see what happens if we have more points in higher dimensions
# 1d space: 10 points
= 10
n 42)
np.random.seed(= 4
n_dim = np.zeros((n_exp, n_dim))
ratios_nd_more_points = [10, 50, 200, 2000]
num_points for i in range(n_exp):
for d in range(1, n_dim + 1):
= np.random.uniform(0, 1, (num_points[d-1], d))
x = np.random.uniform(0, 1, d)
x_test = x[np.argmin(np.linalg.norm(x - x_test, axis=1))]
x_nearest = x[np.argmax(np.linalg.norm(x - x_test, axis=1))]
x_farthest - 1] = np.linalg.norm(x_test - x_nearest) / np.linalg.norm(x_test - x_farthest) ratios_nd_more_points[i, d
# Plot the ratio of distances between the nearest and farthest points in 1000 experiments for each dimension
=(10, 6))
plt.figure(figsize1, n_dim + 1), np.mean(ratios_nd_more_points, axis=0), 'o-')
plt.plot(np.arange('Dimension')
plt.xlabel('Ratio of distances')
plt.ylabel('Ratio of distances between the nearest and farthest points in 1000 experiments for each dimension')
plt.title(0, 1) plt.ylim(
(0.0, 1.0)
# Now showing how linear regression is affected by curse of dimensionality
= 20
n_points
= np.random.uniform(0, 1, (n_points, 1))
x # sort the x values
= np.sort(x, axis=0)
x = 4 * x + 3 + np.random.normal(0, 0.5, (n_points, 1))
y
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x7f53d8f5feb0>
# Let us fit a linear regression model of degree d
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
def fit_linear_regression(x, y, d):
= PolynomialFeatures(degree=d)
poly = poly.fit_transform(x)
x = LinearRegression()
model
model.fit(x, y)return model
def plot_linear_regression(x, y, d):
= fit_linear_regression(x, y, d)
model = model.predict(PolynomialFeatures(degree=d).fit_transform(x))
y_pred
plt.scatter(x, y)='r')
plt.plot(x, y_pred, c plt.show()
1) plot_linear_regression(x, y,
5) plot_linear_regression(x, y,
10) plot_linear_regression(x, y,
15) plot_linear_regression(x, y,
# Now, we see that if we increase the number of points, the model will fit better
= 1000
n_points
= np.random.uniform(0, 1, (n_points, 1))
x # sort the x values
= np.sort(x, axis=0)
x = 4 * x + 3 + np.random.normal(0, 0.5, (n_points, 1))
y
1) plot_linear_regression(x, y,
5) plot_linear_regression(x, y,
25) plot_linear_regression(x, y,