Linear Regression Tutorial

import numpy as np
import scipy.linalg
import matplotlib.pyplot as plt
%matplotlib inline

from tueplots import bundles
plt.rcParams.update(bundles.icml2022())
1. Maximum Likelihood
Let us compute the maximum likelihood estimate for a given training set.
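For a linear model \(f(\boldsymbol x) = \boldsymbol x^\top\boldsymbol\theta\) with Gaussian observation noise, the maximum likelihood estimator solves the normal equations \[\boldsymbol X^\top\boldsymbol X\,\boldsymbol\theta_{ml} = \boldsymbol X^\top\boldsymbol y \quad\Longleftrightarrow\quad \boldsymbol\theta_{ml} = (\boldsymbol X^\top\boldsymbol X)^{-1}\boldsymbol X^\top\boldsymbol y\,,\] which the function below computes via np.linalg.solve rather than an explicit matrix inverse.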
## EDIT THIS FUNCTION
def max_lik_estimate(X, y):
    # X: N x D matrix of training inputs
    # y: N x 1 vector of training targets/observations
    # returns: maximum likelihood parameters (D x 1)
    N, D = X.shape
    theta_ml = np.linalg.solve(X.T @ X, X.T @ y)  ## <-- SOLUTION
    return theta_ml
Now, make a prediction using the maximum likelihood estimate that we just found.
## EDIT THIS FUNCTION
def predict_with_estimate(Xtest, theta):
    # Xtest: K x D matrix of test inputs
    # theta: D x 1 vector of parameters
    # returns: prediction of f(Xtest); K x 1 vector
    prediction = Xtest @ theta  ## <-- SOLUTION

    return prediction
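As a quick sanity check, here is a minimal usage sketch of the two functions above (the demo data and the names X_demo, y_demo, theta_demo are made up for illustration and are not part of the original notebook):

X_demo = np.array([[1.0], [2.0], [3.0]])       # N=3, D=1 training inputs
y_demo = 2.0 * X_demo                          # noise-free targets from f(x) = 2x
theta_demo = max_lik_estimate(X_demo, y_demo)  # should recover approximately [[2.0]]
print(predict_with_estimate(np.array([[4.0]]), theta_demo))  # approximately [[8.0]]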
## EDIT THIS FUNCTION
def poly_features(X, K):
    # X: inputs of size N x 1
    # K: degree of the polynomial
    # computes the feature matrix Phi (N x (K+1))
    X = X.flatten()
    N = X.shape[0]

    # initialize Phi
    Phi = np.zeros((N, K+1))

    # Compute the feature matrix in stages
    for k in range(K+1):
        Phi[:, k] = X**k  ## <-- SOLUTION
    return Phi
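For example (an illustrative call, not part of the original notebook), the degree-2 feature matrix for three inputs has columns \(x^0, x^1, x^2\):

print(poly_features(np.array([[1.0], [2.0], [3.0]]), 2))
# [[1. 1. 1.]
#  [1. 2. 4.]
#  [1. 3. 9.]]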
## EDIT THIS FUNCTION
def nonlinear_features_maximum_likelihood(Phi, y):
    # Phi: feature matrix of the training inputs, size N x D
    # y: training targets, size N x 1
    # returns: maximum likelihood estimator theta_ml, size D x 1
    kappa = 1e-08  # 'jitter' term; good for numerical stability

    D = Phi.shape[1]

    Pt = Phi.T @ y  # Phi^T*y
    PP = Phi.T @ Phi + kappa*np.eye(D)  # Phi^T*Phi + kappa*I

    # maximum likelihood estimate
    C = scipy.linalg.cho_factor(PP)
    theta_ml = scipy.linalg.cho_solve(C, Pt)  # inv(Phi^T*Phi)*Phi^T*y

    return theta_ml
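As an optional cross-check (a sketch not from the original notebook; Phi_demo and y_demo are made-up names), the Cholesky-based estimate should agree with NumPy's least-squares solver up to the tiny jitter term:

Phi_demo = poly_features(np.array([[0.5], [1.5], [2.5], [3.5]]), 2)
y_demo = np.array([[0.0], [1.0], [2.0], [3.0]])
theta_chol = nonlinear_features_maximum_likelihood(Phi_demo, y_demo)
theta_lstsq = np.linalg.lstsq(Phi_demo, y_demo, rcond=None)[0]
print(np.allclose(theta_chol, theta_lstsq, atol=1e-5))  # expect: True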
2. Maximum A Posteriori Estimation
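With a Gaussian prior \(\boldsymbol\theta\sim\mathcal N(\boldsymbol 0, \alpha^2\boldsymbol I)\) and Gaussian noise with standard deviation \(\sigma\), the MAP estimate has the closed form \[\boldsymbol\theta_{map} = \Big(\boldsymbol\Phi^\top\boldsymbol\Phi + \tfrac{\sigma^2}{\alpha^2}\boldsymbol I\Big)^{-1}\boldsymbol\Phi^\top\boldsymbol y\,,\] i.e., regularized least squares with regularization strength \(\sigma^2/\alpha^2\). The function below implements exactly this.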
## EDIT THIS FUNCTION
def map_estimate_poly(Phi, y, sigma, alpha):
    # Phi: feature matrix of the training inputs, size N x D
    # y: training targets, size N x 1
    # sigma: standard deviation of the noise
    # alpha: standard deviation of the prior on the parameters
    # returns: MAP estimate theta_map, size D x 1
    D = Phi.shape[1]

    # SOLUTION
    PP = Phi.T @ Phi + (sigma/alpha)**2 * np.eye(D)
    theta_map = scipy.linalg.solve(PP, Phi.T @ y)

    return theta_map
# define the function we wish to estimate later
def g(x, sigma):
    # true function: g(x) = -1 + 0.1*x + sin(x), observed with Gaussian noise
    p = np.hstack([x**0, x**1, np.sin(x)])
    w = np.array([-1.0, 0.1, 1.0]).reshape(-1,1)
    return p @ w + sigma*np.random.normal(size=x.shape)
# Generate some data
sigma = 1.0  # noise standard deviation
alpha = 1.0  # standard deviation of the parameter prior
N = 20

np.random.seed(42)

X = (np.random.rand(N)*10.0 - 5.0).reshape(-1,1)  # training inputs, size N x 1
y = g(X, sigma)  # training targets

plt.figure()
plt.plot(X, y, '+')
plt.xlabel("$x$")
plt.ylabel("$y$")
plt.savefig('data.pdf')
# get the MAP estimate
K = 8  # polynomial degree

# feature matrix
Phi = poly_features(X, K)

theta_map = map_estimate_poly(Phi, y, sigma, alpha)

# maximum likelihood estimate
theta_ml = nonlinear_features_maximum_likelihood(Phi, y)
Xtest = np.linspace(-5, 5, 100).reshape(-1,1)
ytest = g(Xtest, sigma)

Phi_test = poly_features(Xtest, K)

y_pred_map = Phi_test @ theta_map
y_pred_mle = Phi_test @ theta_ml
plt.figure()
plt.plot(X, y, '+')
plt.plot(Xtest, y_pred_map)
plt.plot(Xtest, y_pred_mle)
plt.legend(["data", r"prediction using $\theta_{MAP}$", r"prediction using $\theta_{MLE}$"], frameon=False)
plt.savefig('map_mle.pdf')
print(np.hstack([theta_ml, theta_map]))
[[-1.49712990e+00 -1.08154986e+00]
[ 8.56868912e-01 6.09177023e-01]
[-1.28335730e-01 -3.62071208e-01]
[-7.75319509e-02 -3.70531732e-03]
[ 3.56425467e-02 7.43090617e-02]
[-4.11626749e-03 -1.03278646e-02]
[-2.48817783e-03 -4.89363010e-03]
[ 2.70146690e-04 4.24148554e-04]
[ 5.35996050e-05 1.03384719e-04]]
3. Bayesian Linear Regression
# Test inputs
Ntest = 200
Xtest = np.linspace(-5, 5, Ntest).reshape(-1,1)  # test inputs

prior_var = 2.0  # variance of the parameter prior (alpha^2). We assume this is known.
noise_var = 1.0  # noise variance (sigma^2). We assume this is known.

pol_deg = 3  # degree of the polynomial we consider at the moment
## EDIT THIS CELL
# compute the feature matrix for the test inputs
Phi_test = poly_features(Xtest, pol_deg)  # N x (pol_deg+1) feature matrix <-- SOLUTION

# compute the (marginal) prior at the test input locations
# prior mean
prior_mean = np.zeros((Ntest,1))  # <-- SOLUTION

# prior variance
full_covariance = Phi_test @ Phi_test.T * prior_var  # N x N covariance matrix of all function values
prior_marginal_var = np.diag(full_covariance)
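Since \(f(\boldsymbol X_{\text{test}}) = \boldsymbol\Phi_{\text{test}}\boldsymbol\theta\) is a linear map of \(\boldsymbol\theta\sim\mathcal N(\boldsymbol 0, \alpha^2\boldsymbol I)\), the prior over function values is \[p(f(\boldsymbol X_{\text{test}})) = \mathcal N(\boldsymbol 0,\; \alpha^2\,\boldsymbol\Phi_{\text{test}}\boldsymbol\Phi_{\text{test}}^\top)\,,\] and the diagonal of this covariance gives the marginal variances plotted next.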
# Let us visualize the prior over functions
plt.figure()
plt.plot(Xtest, prior_mean, color="k")

conf_bound1 = np.sqrt(prior_marginal_var).flatten()
conf_bound2 = 2.0*np.sqrt(prior_marginal_var).flatten()
conf_bound3 = 2.0*np.sqrt(prior_marginal_var + noise_var).flatten()

plt.fill_between(Xtest.flatten(), prior_mean.flatten() + conf_bound1,
                 prior_mean.flatten() - conf_bound1, alpha=0.1, color="k")
plt.fill_between(Xtest.flatten(), prior_mean.flatten() + conf_bound2,
                 prior_mean.flatten() - conf_bound2, alpha=0.1, color="k")
plt.fill_between(Xtest.flatten(), prior_mean.flatten() + conf_bound3,
                 prior_mean.flatten() - conf_bound3, alpha=0.1, color="k")

plt.xlabel('$x$')
plt.ylabel('$y$')
plt.title("Prior over functions");
Now, we will use this prior distribution and sample functions from it.
## EDIT THIS CELL
# samples from the prior
num_samples = 10

# We first need to generate random weights theta_i, which we sample from the parameter prior
random_weights = np.random.normal(size=(pol_deg+1, num_samples), scale=np.sqrt(prior_var))

# Now, we compute the induced random functions, evaluated at the test input locations
# Every function sample is given as f_i = Phi * theta_i,
# where theta_i is a sample from the parameter prior
sample_function = Phi_test @ random_weights  # <-- SOLUTION

plt.figure()
plt.plot(Xtest, sample_function, color="r")
plt.title("Plausible functions under the prior")
print("Every sampled function is a polynomial of degree " + str(pol_deg));
Every sampled function is a polynomial of degree 3
Now we are given some training inputs \(\boldsymbol x_1, \dotsc, \boldsymbol x_N\), which we collect in a matrix \(\boldsymbol X = [\boldsymbol x_1, \dotsc, \boldsymbol x_N]^\top\in\mathbb{R}^{N\times D}\).
N = 10
X = np.random.uniform(high=5, low=-5, size=(N,1))  # training inputs, size N x 1
y = g(X, np.sqrt(noise_var))  # training targets, size N x 1
Now, let us compute the parameter posterior.
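For this model, the parameter posterior is Gaussian, \(p(\boldsymbol\theta \mid \boldsymbol X, \boldsymbol y) = \mathcal N(\boldsymbol\theta \mid \boldsymbol m_N, \boldsymbol S_N)\), with \[\boldsymbol S_N = \Big(\tfrac{1}{\alpha^2}\boldsymbol I + \tfrac{1}{\sigma^2}\boldsymbol\Phi^\top\boldsymbol\Phi\Big)^{-1}, \qquad \boldsymbol m_N = \tfrac{1}{\sigma^2}\,\boldsymbol S_N\boldsymbol\Phi^\top\boldsymbol y\,.\] The function below computes these quantities, alongside the ML and MAP estimates.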
## EDIT THIS FUNCTION
def polyfit(X, y, K, prior_var, noise_var):
    # X: training inputs, size N x D
    # y: training targets, size N x 1
    # K: degree of the polynomial we consider
    # prior_var: prior variance of the parameter distribution
    # noise_var: noise variance
    jitter = 1e-08  # increases numerical stability

    Phi = poly_features(X, K)  # N x (K+1) feature matrix

    Pt = Phi.T @ y  # Phi^T*y, size (K+1,1)
    PP = Phi.T @ Phi + jitter*np.eye(K+1)  # Phi^T*Phi, size (K+1, K+1)

    # maximum likelihood estimate
    C = scipy.linalg.cho_factor(PP)
    theta_ml = scipy.linalg.cho_solve(C, Pt)  # inv(Phi^T*Phi)*Phi^T*y, size (K+1,1)
    # theta_ml = scipy.linalg.solve(PP, Pt)  # equivalent, without the Cholesky factorization

    # MAP estimate
    theta_map = scipy.linalg.solve(PP + noise_var/prior_var*np.eye(K+1), Pt)

    # parameter posterior
    iSN = np.eye(K+1)/prior_var + PP/noise_var  # posterior precision
    SN = scipy.linalg.pinv(noise_var*np.eye(K+1)/prior_var + PP)*noise_var  # posterior covariance
    mN = scipy.linalg.solve(iSN, Pt/noise_var)  # posterior mean

    return (theta_ml, theta_map, mN, SN)
theta_ml, theta_map, theta_mean, theta_var = polyfit(X, y, pol_deg, prior_var, noise_var)
Now, let’s make predictions (ignoring the measurement noise). We obtain three predictors: \[\begin{align} &\text{Maximum likelihood: }E[f(\boldsymbol X_{\text{test}})] = \boldsymbol \phi(X_{\text{test}})\boldsymbol \theta_{ml}\\ &\text{Maximum a posteriori: } E[f(\boldsymbol X_{\text{test}})] = \boldsymbol \phi(X_{\text{test}})\boldsymbol \theta_{map}\\ &\text{Bayesian: } p(f(\boldsymbol X_{\text{test}})) = \mathcal N(f(\boldsymbol X_{\text{test}}) \,|\, \boldsymbol \phi(X_{\text{test}}) \boldsymbol\theta_{\text{mean}},\, \boldsymbol\phi(X_{\text{test}}) \boldsymbol\theta_{\text{var}} \boldsymbol\phi(X_{\text{test}})^\top) \end{align}\] We already computed all quantities. Write some code that implements all three predictors.
## EDIT THIS CELL
# predictions (ignoring the measurement/observation noise)
Phi_test = poly_features(Xtest, pol_deg)  # N x (K+1)

# maximum likelihood predictions (just the mean)
m_mle_test = Phi_test @ theta_ml

# MAP predictions (just the mean)
m_map_test = Phi_test @ theta_map

# predictive distribution (Bayesian linear regression)
# mean prediction
mean_blr = Phi_test @ theta_mean
# variance prediction
cov_blr = Phi_test @ theta_var @ Phi_test.T
# plot the posterior
plt.figure()
plt.plot(X, y, "+")
# plt.plot(Xtest, m_mle_test)
# plt.plot(Xtest, m_map_test)

var_blr = np.diag(cov_blr)
conf_bound1 = np.sqrt(var_blr).flatten()
conf_bound2 = 2.0*np.sqrt(var_blr).flatten()
conf_bound3 = 2.0*np.sqrt(var_blr + noise_var).flatten()  # include the noise variance

plt.fill_between(Xtest.flatten(), mean_blr.flatten() + conf_bound1,
                 mean_blr.flatten() - conf_bound1, alpha=0.1, color="k")
# plt.fill_between(Xtest.flatten(), mean_blr.flatten() + conf_bound2,
#                  mean_blr.flatten() - conf_bound2, alpha=0.1, color="k")
# plt.fill_between(Xtest.flatten(), mean_blr.flatten() + conf_bound3,
#                  mean_blr.flatten() - conf_bound3, alpha=0.1, color="k")

plt.legend(["Training data", "BLR"], frameon=False)
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.ylim(-7, 4)
plt.savefig('blr.pdf')
# plot the posterior, now including the MLE and MAP predictors
plt.figure()
plt.plot(X, y, "+")
plt.plot(Xtest, m_mle_test)
plt.plot(Xtest, m_map_test)

var_blr = np.diag(cov_blr)
conf_bound1 = np.sqrt(var_blr).flatten()
conf_bound2 = 2.0*np.sqrt(var_blr).flatten()
conf_bound3 = 2.0*np.sqrt(var_blr + noise_var).flatten()  # include the noise variance

plt.fill_between(Xtest.flatten(), mean_blr.flatten() + conf_bound1,
                 mean_blr.flatten() - conf_bound1, alpha=0.1, color="k")
plt.fill_between(Xtest.flatten(), mean_blr.flatten() + conf_bound2,
                 mean_blr.flatten() - conf_bound2, alpha=0.1, color="k")
plt.fill_between(Xtest.flatten(), mean_blr.flatten() + conf_bound3,
                 mean_blr.flatten() - conf_bound3, alpha=0.1, color="k")

plt.legend(["Training data", "MLE", "MAP", "BLR"], frameon=False)
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.ylim(-7, 4)
plt.savefig('blr_mle_map.pdf')