Logistic Regression using PyTorch distributions

ML
Author

Nipun Batra

Published

February 14, 2022

Basic Imports

Code
import numpy as np
import matplotlib.pyplot as plt
import torch
import seaborn as sns
import pandas as pd

dist = torch.distributions

sns.reset_defaults()
sns.set_context(context="talk", font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format='retina'

Generative model for logistic regression
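
In symbols, the sampling process in the next few cells is the standard generative story for logistic regression:

$$
x_{i1} \sim \mathcal{N}(0, 1^2), \quad x_{i2} \sim \mathcal{N}(0, 2^2), \qquad
\boldsymbol{\theta} \sim \mathcal{N}(\mathbf{0},\, 0.5\,\mathbf{I}_3), \qquad
y_i \sim \text{Bernoulli}\big(\sigma(\tilde{\mathbf{x}}_i^\top \boldsymbol{\theta})\big)
$$

where $\tilde{\mathbf{x}}_i = (1, x_{i1}, x_{i2})^\top$ prepends a constant feature so the intercept is part of $\boldsymbol{\theta}$, and $\sigma(z) = 1/(1+e^{-z})$ is the logistic sigmoid.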

Code
x = dist.Normal(loc=torch.tensor([0., 0.]), scale=torch.tensor([1., 2.]))
x_sample = x.sample([100])
x_sample.shape
torch.Size([100, 2])
Code
# Prepend a column of ones so the intercept is absorbed into theta
x_dash = torch.cat((torch.ones(x_sample.shape[0], 1), x_sample), dim=1)
Code
theta = dist.MultivariateNormal(loc=torch.tensor([0., 0., 0.]), covariance_matrix=0.5*torch.eye(3))
theta_sample = theta.sample()

# Class-1 probability for each point, then one Bernoulli draw per point
p = torch.sigmoid(x_dash@theta_sample)

y = dist.Bernoulli(probs=p)
y_sample = y.sample()
Code
plt.scatter(x_sample[:, 0], x_sample[:, 1], c=y_sample, s=40, alpha=0.5)
sns.despine()

Code
theta_sample
tensor([ 0.6368, -0.7526,  1.4652])
Code
from sklearn.linear_model import LogisticRegression

lr_l2 = LogisticRegression()  # default: l2 penalty with C=1.0
# 'none' disables regularization; newer scikit-learn releases use penalty=None instead
lr_none = LogisticRegression(penalty='none')
Code
lr_l2.fit(x_sample, y_sample)
lr_none.fit(x_sample, y_sample)
LogisticRegression(penalty='none')
Code
def plot_fit(x_sample, y_sample, theta, model_name):
    # Retrieve the model parameters.
    b = theta[0]
    w1, w2 = theta[1], theta[2]
    # Calculate the intercept and gradient of the decision boundary.
    c = -b/w2
    m = -w1/w2

    # Plot the data and the classification with the decision boundary.
    xmin, xmax = x_sample[:, 0].min()-0.2, x_sample[:, 0].max()+0.2
    ymin, ymax =  x_sample[:, 1].min()-0.2, x_sample[:, 1].max()+0.2
    xd = np.array([xmin, xmax])
    yd = m*xd + c
    plt.plot(xd, yd, 'k', lw=1, ls='--')
    plt.fill_between(xd, yd, ymin, color='tab:blue', alpha=0.2)
    plt.fill_between(xd, yd, ymax, color='tab:orange', alpha=0.2)

    plt.scatter(*x_sample[y_sample==0].T, s=20, alpha=0.5)
    plt.scatter(*x_sample[y_sample==1].T, s=20, alpha=0.5)
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    plt.ylabel(r'$x_2$')
    plt.xlabel(r'$x_1$')
    theta_print = np.round(theta, 1)
    plt.title(f"{model_name}\n{theta_print}")
    sns.despine()
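
The intercept and slope above follow from setting the logit to zero on the decision boundary:

$$
\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0
\;\Longrightarrow\;
x_2 = -\frac{\theta_0}{\theta_2} - \frac{\theta_1}{\theta_2}\, x_1,
$$

which gives the intercept $c = -\theta_0/\theta_2$ and gradient $m = -\theta_1/\theta_2$ used in the code.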
Code
plot_fit(
    x_sample,
    y_sample,
    theta_sample,
    r"Generating $\theta$",
)

Code
plot_fit(
    x_sample,
    y_sample,
    np.concatenate((lr_l2.intercept_.reshape(-1, 1), lr_l2.coef_), axis=1).flatten(),
    r"Sklearn $\ell_2$ penalty ",
)

Code
plot_fit(
    x_sample,
    y_sample,
    np.concatenate((lr_none.intercept_.reshape(-1, 1), lr_none.coef_), axis=1).flatten(),
    r"Sklearn No penalty ",
)

MLE estimate with PyTorch
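
The loss minimized in this section is the negative log-likelihood of the Bernoulli model above:

$$
\mathrm{NLL}(\boldsymbol{\theta}) = -\sum_{i=1}^{n} \Big[ y_i \log p_i + (1 - y_i) \log(1 - p_i) \Big],
\qquad p_i = \sigma(\tilde{\mathbf{x}}_i^\top \boldsymbol{\theta}).
$$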

Code
def neg_log_likelihood(theta, x, y):
    x_dash = torch.cat((torch.ones(x.shape[0], 1), x), dim=1)
    p = torch.sigmoid(x_dash@theta)
    y_dist = dist.Bernoulli(probs=p)
    # Negated sum of per-point Bernoulli log-probabilities
    return -torch.sum(y_dist.log_prob(y))
Code
neg_log_likelihood(theta_sample, x_sample, y_sample)
tensor(33.1907)
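As a sanity check, the same value can be computed without `torch.distributions`: the Bernoulli negative log-likelihood is exactly the summed binary cross-entropy. A minimal sketch, reusing `x_dash` and `theta_sample` from above:

Code
import torch.nn.functional as F

# Summed binary cross-entropy equals the Bernoulli NLL above
p_check = torch.sigmoid(x_dash@theta_sample)
F.binary_cross_entropy(p_check, y_sample, reduction="sum")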
Code
theta_learn_loc = torch.tensor([1., 1., 1.], requires_grad=True)
neg_log_likelihood(theta_learn_loc, x_sample, y_sample)

plot_fit(
    x_sample,
    y_sample,
    theta_learn_loc.detach(),
    r"Torch without training",
)

Code
theta_learn_loc = torch.tensor([0., 0., 0.], requires_grad=True)
loss_array = []
loc_array = []

opt = torch.optim.Adam([theta_learn_loc], lr=0.05)
for i in range(101):
    loss_val = neg_log_likelihood(theta_learn_loc, x_sample, y_sample)
    opt.zero_grad()
    loss_val.backward()
    # Store a detached copy; appending the tensor itself would record
    # the same (mutating) object at every iteration.
    loc_array.append(theta_learn_loc.detach().clone())
    loss_array.append(loss_val.item())

    if i % 10 == 0:
        print(f"Iteration: {i}, Loss: {loss_val.item():0.2f}")
    opt.step()
Iteration: 0, Loss: 69.31
Iteration: 10, Loss: 44.14
Iteration: 20, Loss: 35.79
Iteration: 30, Loss: 32.73
Iteration: 40, Loss: 31.67
Iteration: 50, Loss: 31.25
Iteration: 60, Loss: 31.08
Iteration: 70, Loss: 31.00
Iteration: 80, Loss: 30.97
Iteration: 90, Loss: 30.95
Iteration: 100, Loss: 30.94
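The loop above collects `loss_array` but never plots it; a quick sketch to confirm the optimization has converged:

Code
plt.plot(loss_array)
plt.xlabel("Iteration")
plt.ylabel("Negative log-likelihood")
sns.despine()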
Code
plot_fit(
    x_sample,
    y_sample,
    theta_learn_loc.detach(),
    r"Torch MLE",
)

MAP estimate with PyTorch
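
MAP adds the negative log-prior to the NLL; for the $\mathcal{N}(\mathbf{0}, \sigma^2 \mathbf{I}_3)$ prior used below (with $\sigma^2 = 2$), this is an $\ell_2$ penalty up to an additive constant:

$$
-\log p(\boldsymbol{\theta} \mid \mathcal{D})
= \mathrm{NLL}(\boldsymbol{\theta}) + \frac{\lVert \boldsymbol{\theta} \rVert^2}{2\sigma^2} + \text{const}.
$$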

Code
prior_theta = dist.MultivariateNormal(loc=torch.tensor([0., 0., 0.]), covariance_matrix=2*torch.eye(3))

# Negative log-prior; named to make the sign convention explicit
neg_log_prior = lambda theta: -prior_theta.log_prob(theta)
Code
theta_learn_loc = torch.tensor([0., 0., 0.], requires_grad=True)
loss_array = []
loc_array = []

opt = torch.optim.Adam([theta_learn_loc], lr=0.05)
for i in range(101):
    loss_val = neg_log_likelihood(theta_learn_loc, x_sample, y_sample) + neg_log_prior(theta_learn_loc)
    opt.zero_grad()
    loss_val.backward()
    loc_array.append(theta_learn_loc.detach().clone())
    loss_array.append(loss_val.item())

    if i % 10 == 0:
        print(f"Iteration: {i}, Loss: {loss_val.item():0.2f}")
    opt.step()
Iteration: 0, Loss: 73.11
Iteration: 10, Loss: 48.06
Iteration: 20, Loss: 39.89
Iteration: 30, Loss: 37.01
Iteration: 40, Loss: 36.10
Iteration: 50, Loss: 35.78
Iteration: 60, Loss: 35.67
Iteration: 70, Loss: 35.64
Iteration: 80, Loss: 35.62
Iteration: 90, Loss: 35.62
Iteration: 100, Loss: 35.62
Code
plot_fit(
    x_sample,
    y_sample,
    theta_learn_loc.detach(),
    r"Torch MAP",
)
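
For a rough cross-check against sklearn: its $\ell_2$-penalized objective is equivalent to $\mathrm{NLL} + \lVert \mathbf{w} \rVert^2 / (2C)$, so a Gaussian prior with variance $\sigma^2$ corresponds approximately to $C = \sigma^2 = 2$ here. A hedged sketch (the match is only approximate, since sklearn leaves the intercept unpenalized; `lr_map_like` is a name introduced for illustration):

Code
# MAP with a N(0, 2I) prior roughly matches sklearn's l2 penalty with C = 2
lr_map_like = LogisticRegression(C=2.0)
lr_map_like.fit(x_sample, y_sample)
np.concatenate((lr_map_like.intercept_, lr_map_like.coef_.flatten()))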

References


  1. Plotting code borrowed from: https://scipython.com/blog/plotting-the-decision-boundary-of-a-logistic-regression-model/