Bayesian Logistic Regression

Imports

import torch
import torch.nn as nn
import torch.distributions as dist
from torch.func import jacfwd, hessian
import tqdm
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from tueplots.bundles import beamer_moml
import matplotlib.pyplot as plt

# Retina display
%config InlineBackend.figure_format = 'retina'

# Use render mode to run the notebook and save the plots in beamer format
# Use interactive mode to run the notebook and show the plots in notebook-friendly format
mode = "render"  # "interactive" or "render"

if mode == "render":
    width = 0.6
    plt.rcParams.update(beamer_moml(rel_width=width, rel_height=width * 0.8))
    # update marker size
    plt.rcParams.update({"lines.markersize": 4})
    plt.rcParams["figure.facecolor"] = "none"
else:
    plt.rcdefaults()
def plt_show(name=None):
    if mode == "interactive":
        plt.show()
    elif mode == "render":
        plt.savefig(f"../figures/bayesian-logistic-regression/{name}.pdf")
    else:
        raise ValueError(f"Unknown mode: {mode}")
Data
X_np, y_np = make_blobs(
    n_samples=20, n_features=2, centers=2, random_state=11, cluster_std=1
)
df = pd.DataFrame(dict(x1=X_np[:, 0], x2=X_np[:, 1], y=y_np))
df.head(4)
|   | x1 | x2 | y |
|---|---|---|---|
| 0 | -5.973556 | -10.676098 | 0 |
| 1 | -0.439282 | 5.901450 | 1 |
| 2 | -0.972880 | 3.266332 | 1 |
| 3 | -5.298977 | -9.920072 | 0 |
ones_df = df[df.y == 1]
zeros_df = df[df.y == 0]
plt.scatter(ones_df.x1, ones_df.x2, marker="o", label="$y=1$", c="tab:blue")
plt.scatter(zeros_df.x1, zeros_df.x2, marker="x", label="$y=0$", c="tab:red")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.legend(bbox_to_anchor=(1.3, 1))
plt_show("data")
if mode == "render":
    print(df.head(4).style.format("{:.2f}").to_latex())
\begin{tabular}{lrrr}
& x1 & x2 & y \\
0 & -5.97 & -10.68 & 0.00 \\
1 & -0.44 & 5.90 & 1.00 \\
2 & -0.97 & 3.27 & 1.00 \\
3 & -5.30 & -9.92 & 0.00 \\
\end{tabular}
X, y = map(lambda x: torch.tensor(x, dtype=torch.float32), (X_np, y_np))
y = y.reshape(-1, 1)
X.shape, y.shape
(torch.Size([20, 2]), torch.Size([20, 1]))
MLE
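The model is plain logistic regression without a bias term: $p(y = 1 \mid x, \theta) = \sigma(\theta_1 x_1 + \theta_2 x_2)$ with $\theta \in \mathbb{R}^2$. For reference, the objective implemented below is the negative log-likelihood (binary cross-entropy) summed over the $n = 20$ points,

$$
\mathrm{NLL}(\theta) = -\sum_{i=1}^{n}\left[y_i \log \sigma(x_i^\top \theta) + (1 - y_i)\log\left(1 - \sigma(x_i^\top \theta)\right)\right],
$$

which is minimized with Adam to obtain the maximum-likelihood estimate.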
def negative_log_likelihood(theta, y):
    probs = torch.sigmoid(X @ theta)
    # print(probs.shape, y.shape)
    return -torch.sum(y * torch.log(probs) + (1 - y) * torch.log(1 - probs))
torch.manual_seed(42)
theta = torch.randn(2, 1, requires_grad=True)
epochs = 1000

optimizer = torch.optim.Adam([theta], lr=0.01)

losses = []
pbar = tqdm.trange(epochs)
for epoch in pbar:
    optimizer.zero_grad()
    loss = negative_log_likelihood(theta, y)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    pbar.set_description(f"loss: {loss.item():.2f}")
loss: 0.02: 100%|██████████| 1000/1000 [00:01<00:00, 886.72it/s]
plt.plot(losses)
Plot decision boundary
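The helper below evaluates the fitted model on a 20 × 20 grid spanning the data range: $\sigma(\theta_1 x_1 + \theta_2 x_2)$ is computed at every grid point via two nested `torch.vmap` calls, and the plane is shaded by the predicted probability of class $y=1$, with the training points overlaid.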
def plot_decision_boundary(name):
    x0_grid = torch.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 20).reshape(-1, 1)
    x1_grid = torch.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 20).reshape(-1, 1)
    X0, X1 = torch.meshgrid(x0_grid.ravel(), x1_grid.ravel())

    f = lambda x1, x2: torch.sigmoid(theta[0] * x1 + theta[1] * x2).squeeze()
    f = torch.vmap(torch.vmap(f))

    with torch.no_grad():
        probs = f(X0, X1)

    fig, ax = plt.subplots(1, 2, gridspec_kw=dict(width_ratios=[1, 0.05]))
    ax[0].scatter(ones_df.x1, ones_df.x2, marker="o", label="$y=1$", c="tab:blue")
    ax[0].scatter(zeros_df.x1, zeros_df.x2, marker="x", label="$y=0$", c="tab:red")
    mappable = ax[0].contourf(
        X0, X1, probs, levels=20, alpha=0.5, cmap="RdBu", vmin=0, vmax=1
    )
    ax[0].set_xlabel("$x_1$")
    ax[0].set_ylabel("$x_2$")
    sigma_term = "$\\sigma^2$" + f" = {variance:.3f}, " if name.startswith("map") else ""
    ax[0].set_title(
        sigma_term
        + "$\\theta_1 = $"
        + f"{theta[0].item():.3f}, $\\theta_2 = $"
        + f"{theta[1].item():.3f}"
    )
    cbar = fig.colorbar(mappable, ticks=np.linspace(0, 1, 5), cax=ax[1])
    fig.legend(bbox_to_anchor=(1.2, 1))
    plt_show(name)

plot_decision_boundary("mle")
MAP
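For the MAP estimate, the same likelihood is combined with an isotropic Gaussian prior $\theta \sim \mathcal{N}(0, \sigma^2 I)$, where $\sigma^2$ is the `variance` set below. Up to an additive constant, the negative log joint implemented next is

$$
-\log p(y, \theta) = \mathrm{NLL}(\theta) + \frac{1}{2\sigma^2}\lVert\theta\rVert^2 + \mathrm{const},
$$

i.e. L2-regularized logistic regression with weight-decay strength $1/(2\sigma^2)$.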
def neg_log_prior(theta, variance):
    return -dist.Normal(0, variance**0.5).log_prob(theta).sum()


def negative_log_joint(theta, variance):
    return (
        negative_log_likelihood(theta, y).sum() + neg_log_prior(theta, variance)
    ).squeeze()
torch.manual_seed(42)
variance = 0.01
theta = torch.randn(2, 1, requires_grad=True)
epochs = 500

optimizer = torch.optim.Adam([theta], lr=0.01)

losses = []
pbar = tqdm.trange(epochs)
for epoch in pbar:
    optimizer.zero_grad()
    loss = negative_log_joint(theta, variance).mean()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    pbar.set_description(f"loss: {loss.item():.2f}")
loss: 3.78: 100%|██████████| 500/500 [00:00<00:00, 658.82it/s]
plt.plot(losses)
Plot decision boundary
f"map_{variance}") plot_decision_boundary(
Laplace approximation
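The Laplace approximation replaces the intractable posterior with a Gaussian centred at the MAP estimate $\theta_{\mathrm{MAP}}$, with covariance equal to the inverse Hessian of the negative log joint at that point:

$$
p(\theta \mid \mathcal{D}) \approx \mathcal{N}\big(\theta_{\mathrm{MAP}},\, H^{-1}\big), \qquad H = \nabla^2_\theta\,\big[-\log p(y, \theta)\big]\Big|_{\theta = \theta_{\mathrm{MAP}}}.
$$

Below, $H$ is computed with `torch.func.hessian` and the resulting Gaussian is built as a `MultivariateNormal`.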
with torch.no_grad():
    probs = torch.sigmoid(X @ theta)

hess = hessian(negative_log_joint)(theta.ravel(), torch.tensor(variance))
cov = torch.inverse(hess)
print(cov)
posterior = dist.MultivariateNormal(theta.ravel(), cov)
tensor([[ 0.0020, -0.0007],
[-0.0007, 0.0006]], grad_fn=<LinalgInvExBackward0>)
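Given the Gaussian approximate posterior, the predictive probability over the plotting grid is estimated by simple Monte Carlo: draw $S = 1000$ weight samples from the Laplace posterior and average the corresponding sigmoids,

$$
p(y = 1 \mid x, \mathcal{D}) \approx \frac{1}{S}\sum_{s=1}^{S} \sigma(x^\top \theta_s), \qquad \theta_s \sim \mathcal{N}(\theta_{\mathrm{MAP}}, H^{-1}).
$$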
= f"map_laplace-{variance}"
name = torch.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 20).reshape(-1, 1)
x0_grid = torch.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 20).reshape(-1, 1)
x1_grid = torch.meshgrid(x0_grid.ravel(), x1_grid.ravel())
X0, X1
def get_probs(theta):
    f = lambda x1, x2: torch.sigmoid(theta[0] * x1 + theta[1] * x2).squeeze()
    f = torch.vmap(torch.vmap(f))
    with torch.no_grad():
        probs = f(X0, X1)
    return probs
theta_samples = posterior.sample((1000,))
probs = torch.stack([get_probs(theta) for theta in theta_samples])
print(probs.shape)
probs = probs.mean(0)
fig, ax = plt.subplots(1, 2, gridspec_kw=dict(width_ratios=[1, 0.05]))
ax[0].scatter(ones_df.x1, ones_df.x2, marker="o", label="$y=1$", c="tab:blue")
ax[0].scatter(zeros_df.x1, zeros_df.x2, marker="x", label="$y=0$", c="tab:red")
mappable = ax[0].contourf(
    X0, X1, probs, levels=20, alpha=0.5, cmap="RdBu", vmin=0, vmax=1
)
ax[0].set_xlabel("$x_1$")
ax[0].set_ylabel("$x_2$")
sigma_term = "$\\sigma^2$" + f" = {variance:.3f}, " if name.startswith("map") else ""
ax[0].set_title(
    sigma_term
    + "$\\theta_1 = $"
    + f"{theta[0].item():.3f}, $\\theta_2 = $"
    + f"{theta[1].item():.3f}"
)
cbar = fig.colorbar(mappable, ticks=np.linspace(0, 1, 5), cax=ax[1])
fig.legend(bbox_to_anchor=(1.2, 1))
plt_show(name)
torch.Size([1000, 20, 20])
Appendix
Can we find a closed-form MLE solution for Bayesian logistic regression? It seems we can. Stay tuned!