import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
from latexify import latexify, format_axes
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
import pandas as pd
import ipywidgets as widgets
Bias-Variance Tradeoff
ML
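For a test point $x$ with $y = f_{true}(x) + \varepsilon$, where the noise $\varepsilon$ has mean zero and variance $\sigma^2$, the expected squared error of a learned model $\hat{f}$, averaged over random training sets, decomposes as

$$\mathbb{E}\left[(y - \hat{f}(x))^2\right] = \underbrace{\left(\mathbb{E}[\hat{f}(x)] - f_{true}(x)\right)^2}_{\text{Bias}^2} + \underbrace{\mathbb{E}\left[\left(\hat{f}(x) - \mathbb{E}[\hat{f}(x)]\right)^2\right]}_{\text{Variance}} + \sigma^2.$$

The cells below estimate the first two terms empirically: fit the same model class on many resampled datasets, then average the fits.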
latexify(columns=2)
# True function: a gently curving polynomial with sinusoidal ripples on [0, 10]
x_overall = np.linspace(0, 10, 50)
f_x = 0.2*np.sin(x_overall) + 0.2*np.cos(2*x_overall) + 0.6*x_overall - 0.05*x_overall**2 - 0.003*x_overall**3

# Observations: true function plus unit-variance Gaussian noise
eps = np.random.normal(0, 1, 50)
y_overall = f_x + eps

plt.plot(x_overall, f_x, label='True function')
plt.scatter(x_overall, y_overall, s=10, c='r', label='Noisy data')
format_axes(plt.gca())
plt.legend()
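One caveat the cell above leaves open: the noise draw is unseeded, so every rerun produces a different dataset and different figures. A seeded variant (a sketch, not in the original notebook; the seed value 0 is arbitrary) would make the results reproducible:

# Hypothetical reproducible alternative to the unseeded draw above
rng = np.random.default_rng(0)  # arbitrary fixed seed
eps = rng.normal(0, 1, 50)
y_overall = f_x + eps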
def fit_plot_tree(x, y, depth=1, extra=None):
    # Fit a depth-limited decision tree and plot its prediction
    # against the true function and the noisy observations.
    dt = DecisionTreeRegressor(max_depth=depth)
    dt.fit(x.reshape(-1, 1), y)
    y_pred = dt.predict(x.reshape(-1, 1))

    plt.figure()
    plt.plot(x_overall, f_x, label=r'$f_{true}$', lw=2)
    plt.scatter(x_overall, y_overall, s=10, c='r', label='Noisy data')
    label = r"$\hat{f}$" if not extra else fr"$\hat{{f}}_{{{extra}}}$"
    plt.plot(x, y_pred, label=label, lw=2)
    format_axes(plt.gca())
    plt.legend()
    plt.title(f"Depth = {depth}")
    return dt
for i in range(1, 10):
    fit_plot_tree(x_overall, y_overall, i)
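As the depth plots suggest, deeper trees chase the noise. One way to quantify this (a sketch, not part of the original notebook) is to print the training error, which can only shrink as the depth cap is relaxed:

from sklearn.metrics import mean_squared_error

# Training MSE is non-increasing in max_depth: the tree interpolates the noise
for depth in range(1, 10):
    dt = DecisionTreeRegressor(max_depth=depth).fit(x_overall.reshape(-1, 1), y_overall)
    train_mse = mean_squared_error(y_overall, dt.predict(x_overall.reshape(-1, 1)))
    print(f"Depth {depth}: train MSE = {train_mse:.3f}")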
def fit_plot_polynomial(x, y, degree=1, extra=None, ax=None):
    # Fit a polynomial of the given degree (polynomial features + linear
    # regression) and plot its prediction on the given axes.
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    model.fit(x.reshape(-1, 1), y)
    y_pred = model.predict(x.reshape(-1, 1))
    if ax is None:
        fig, ax = plt.subplots()

    ax.plot(x_overall, f_x, label=r'$f_{true}$', lw=2)
    ax.scatter(x_overall, y_overall, s=10, c='r', label='Noisy data')
    label = r"$\hat{f}$" if not extra else fr"$\hat{{f}}_{{{extra}}}$"
    ax.plot(x, y_pred, label=label, lw=2)
    format_axes(ax)
    ax.legend()
    ax.set_title(f"Degree = {degree}")
    return model
fit_plot_polynomial(x_overall, y_overall, 5)
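To peek inside the returned pipeline, the standard sklearn accessors apply (a quick sketch, not in the original notebook; 'linearregression' is the step name make_pipeline auto-generates from the class name):

# Rebind the returned pipeline and inspect the fitted polynomial coefficients
model = fit_plot_polynomial(x_overall, y_overall, 5)
print(np.round(model.named_steps['linearregression'].coef_, 4))
print(model.named_steps['linearregression'].intercept_)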
def plot_degree(degree=1):
    # Fit the same polynomial degree on 10 random subsamples of the data,
    # one per subplot, to visualise how much the fit varies across datasets.
    regs = []
    fig, axes = plt.subplots(5, 2, figsize=(8, 12), sharex=True, sharey=True)
    for i, ax in enumerate(axes.flatten()):
        # Sample 15 interior points, always keeping the two endpoints
        idx = np.random.choice(np.arange(1, 49), 15, replace=False)
        idx = np.concatenate([[0], idx, [49]])
        idx.sort()
        x = x_overall[idx]
        y = y_overall[idx]
        regs.append(fit_plot_polynomial(x, y, degree=degree, extra=i, ax=ax))
        # remove legend
        #ax.legend().remove()
        ax.scatter(x_overall[idx], y_overall[idx], s=50, c='b', label='Sample', alpha=0.1)
        ax.legend()
    plt.tight_layout()
    plt.show()
    return regs
_ = plot_degree(5)
regs = {}
for i in range(0, 10):
    regs[i] = plot_degree(i)
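ipywidgets is imported at the top of the notebook but never used. One plausible use, sketched here under the assumption of a live Jupyter kernel with widget support, is to scrub through degrees interactively instead of fitting all ten up front:

# Interactive version: each slider move refits the 10 subsampled models
widgets.interact(plot_degree, degree=widgets.IntSlider(min=0, max=9, step=1, value=1))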
def plot_predictions(reg):
    # Overlay the 10 fitted curves, their pointwise mean, and the true function
    x_test = np.linspace(0, 10, 50)
    y_pred = np.zeros((10, 50))
    for i in range(10):
        y_pred[i] = reg[i].predict(x_test.reshape(-1, 1))
    plt.plot(x_test, y_pred.mean(axis=0), label=r'$\hat{f}$', lw=2)
    plt.plot(x_test, f_x, label=r'$f_{true}$', lw=2)
    plt.plot(x_test, y_pred.T, lw=1, c='k', alpha=0.5)
    format_axes(plt.gca())
    plt.legend()

plot_predictions(regs[1])
def plot_bias(reg):
    # Average the 10 fits pointwise and shade the gap to the true function
    x_test = np.linspace(0, 10, 50)
    y_pred = np.zeros((10, 50))
    for i in range(10):
        y_pred[i] = reg[i].predict(x_test.reshape(-1, 1))
    y_pred_mean = np.mean(y_pred, axis=0)
    y_pred_var = np.var(y_pred, axis=0)

    plt.plot(x_overall, f_x, label=r'$f_{true}$', lw=2)
    #plt.scatter(x_overall, y_overall, s=10, c='r', label='Noisy data')
    plt.plot(x_test, y_pred_mean, label=r'$\bar{f}$', lw=2)
    plt.fill_between(x_test, y_pred_mean, f_x, alpha=0.2, color='green', label='Bias')
    plt.legend()

plot_bias(regs[7])
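Here $\bar{f}(x) = \frac{1}{10}\sum_{i=1}^{10} \hat{f}_i(x)$ is the pointwise average of the 10 fits, an empirical stand-in for $\mathbb{E}[\hat{f}(x)]$, so the green band between $\bar{f}$ and $f_{true}$ visualises the (signed) bias $\bar{f}(x) - f_{true}(x)$ at each $x$.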
def plot_variance(reg):
    # Shade a band of width equal to the pointwise variance across the 10 fits
    x_test = np.linspace(0, 10, 50)
    y_pred = np.zeros((10, 50))
    for i in range(10):
        y_pred[i] = reg[i].predict(x_test.reshape(-1, 1))
    y_pred_mean = np.mean(y_pred, axis=0)
    y_pred_var = np.var(y_pred, axis=0)

    plt.plot(x_overall, f_x, label=r'$f_{true}$', lw=2)
    #plt.scatter(x_overall, y_overall, s=10, c='r', label='Noisy data')
    plt.plot(x_test, y_pred_mean, label=r'$\bar{f}$', lw=2)
    plt.fill_between(x_test, y_pred_mean - y_pred_var, y_pred_mean + y_pred_var, alpha=0.2, color='red', label='Variance')
    plt.legend()

plot_variance(regs[8])
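The red band spans $\bar{f}(x) \pm \widehat{\mathrm{Var}}(x)$ with $\widehat{\mathrm{Var}}(x) = \frac{1}{10}\sum_{i=1}^{10} \left(\hat{f}_i(x) - \bar{f}(x)\right)^2$. Note the band adds the variance itself rather than the standard deviation $\sqrt{\widehat{\mathrm{Var}}(x)}$, so its width is in squared units; it is a visual device rather than a confidence interval.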
# Plot bias^2 and variance for different degrees as a bar plot
def plot_bias_variance(reg):
    x_test = np.linspace(0, 10, 50)
    y_pred = np.zeros((10, 50))
    for i in range(10):
        y_pred[i] = reg[i].predict(x_test.reshape(-1, 1))
    y_pred_mean = np.mean(y_pred, axis=0)
    y_pred_var = np.var(y_pred, axis=0)

    bias = (y_pred_mean - f_x)**2
    var = y_pred_var
    return bias.sum(), var.sum()

bs = {}
vs = {}
for i in range(1, 8):
    bs[i], vs[i] = plot_bias_variance(regs[i])

# Note: the 'Bias' column holds the summed *squared* bias
df = pd.DataFrame({'Bias': bs, 'Variance': vs})
df.plot.bar(rot=0)
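Since the bar plot tabulates summed squared bias and variance per degree, the sweet spot of the tradeoff can be read off directly (a one-line sketch, not in the original notebook):

# Degree minimising the estimated reducible error, bias^2 + variance
best_degree = (df['Bias'] + df['Variance']).idxmin()
print(f"Best degree by bias^2 + variance: {best_degree}")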