import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
# Retina display
%config InlineBackend.figure_format = 'retina'

CIFAR10 and CIFAR100 dataset
https://www.cs.toronto.edu/~kriz/cifar.html
Ref: https://hjweide.github.io/quantifying-uncertainty-in-neural-networks
- Red objects: cars or apples?
- Green objects: frogs or apples?

Reference slides
https://classifier-calibration.github.io/assets/slides/clacal_tutorial_ecmlpkdd_2020_intro.pdf
See the first 7 slides at the link above.
from tueplots import bundles
plt.rcParams.update(bundles.beamer_moml())
# Also add despine to the bundle using rcParams
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
# Increase font size to match Beamer template
plt.rcParams['font.size'] = 16
# Make background transparent
plt.rcParams['figure.facecolor'] = 'none'
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
X, y = make_moons(n_samples=1000, noise=0.1, random_state=0)
clf = LogisticRegression()
clf.fit(X, y)
p_hats = clf.predict_proba(X)[:, 1]
p_hats[:10]
array([0.60804417, 0.96671272, 0.02867904, 0.41138351, 0.9631718 ,
       0.72298419, 0.51086972, 0.0148838 , 0.01696895, 0.98714042])
true_labels = y
true_labels[:10]
array([1, 1, 0, 1, 1, 1, 0, 0, 0, 1])
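A classifier is perfectly calibrated if, among all samples assigned predicted probability $\hat{p}$, exactly a fraction $\hat{p}$ are positive:

$$P(Y = 1 \mid \hat{p}(X) = p) = p \quad \text{for all } p \in [0, 1]$$

The rest of this section checks how far the logistic regression model is from this ideal.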
df = pd.DataFrame({'p_hat': p_hats, 'y': true_labels})
df.head()

|   | p_hat | y |
|---|---|---|
| 0 | 0.608044 | 1 |
| 1 | 0.966713 | 1 |
| 2 | 0.028679 | 0 |
| 3 | 0.411384 | 1 |
| 4 | 0.963172 | 1 |
# Let us look at samples where p_hat is close to 0.5 (say between 0.45 and 0.55)
df_close_phat_val = df[(df.p_hat > 0.45) & (df.p_hat < 0.55)]
df_close_phat_val.head()

|   | p_hat | y |
|---|---|---|
| 6 | 0.510870 | 0 |
| 15 | 0.481702 | 0 |
| 20 | 0.533697 | 0 |
| 41 | 0.522857 | 1 |
| 60 | 0.483962 | 1 |
len(df_close_phat_val)
31
# Histogram of p_hat values
df.p_hat.hist(bins=50)
plt.xlabel(r'$\hat{p}$')
plt.ylabel('Frequency')
plt.title(r'Histogram of $\hat{p}$ values');

# Out of these 31 samples, how many are actually positive?
df_close_phat_val.y.sum()
18
# Fraction of positive samples
df_close_phat_val.y.sum() / len(df_close_phat_val)
0.5806451612903226
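For a calibrated classifier, roughly half of these samples should be positive; we observe 18/31 ≈ 0.58. With only 31 samples the binomial noise is sizeable, so this is weak evidence of miscalibration at best. A quick sanity check (plain numpy, using only the counts above):

# Standard error of a proportion estimated from n = 31 samples at p = 0.5
n = len(df_close_phat_val)
np.sqrt(0.5 * 0.5 / n)  # ~0.09, so 0.58 is within one standard error of the ideal 0.5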
# This is the reliability of the classifier at p_hat = 0.5
bins = np.linspace(0, 1, 11)
bins
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
# Let us add a column to the dataframe that tells us which bin each sample belongs to
df['bin'] = pd.cut(df.p_hat, bins=bins)
df.head()

|   | p_hat | y | bin |
|---|---|---|---|
| 0 | 0.608044 | 1 | (0.6, 0.7] |
| 1 | 0.966713 | 1 | (0.9, 1.0] |
| 2 | 0.028679 | 0 | (0.0, 0.1] |
| 3 | 0.411384 | 1 | (0.4, 0.5] |
| 4 | 0.963172 | 1 | (0.9, 1.0] |
# Let us compute the reliability of the classifier in each bin
df.groupby('bin').y.mean()
bin
(0.0, 0.1] 0.006231
(0.1, 0.2] 0.241379
(0.2, 0.3] 0.269231
(0.3, 0.4] 0.375000
(0.4, 0.5] 0.620690
(0.5, 0.6] 0.413793
(0.6, 0.7] 0.529412
(0.7, 0.8] 0.722222
(0.8, 0.9] 0.849315
(0.9, 1.0] 0.987097
Name: y, dtype: float64
# Calculate the reliability diagram points
mean_predicted_probs = df.groupby('bin').p_hat.mean()
fraction_of_positives = df.groupby('bin').y.mean()
bin_sizes = df.groupby('bin').size()
bin_sizes
bin
(0.0, 0.1] 321
(0.1, 0.2] 58
(0.2, 0.3] 52
(0.3, 0.4] 40
(0.4, 0.5] 29
(0.5, 0.6] 29
(0.6, 0.7] 34
(0.7, 0.8] 54
(0.8, 0.9] 73
(0.9, 1.0] 310
dtype: int64
ece = (np.abs(mean_predicted_probs - fraction_of_positives) * bin_sizes / len(df)).sum()
ece
0.036133146181597894
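This quantity is the Expected Calibration Error (ECE): a bin-size-weighted average of the gap between mean confidence and empirical accuracy,

$$\mathrm{ECE} = \sum_{b=1}^{B} \frac{n_b}{N} \, \bigl| \bar{p}_b - \bar{y}_b \bigr|$$

where $n_b$ is the number of samples in bin $b$, $\bar{p}_b$ is the mean predicted probability in that bin, and $\bar{y}_b$ is its fraction of positives. The same computation in plain numpy, as a minimal sketch (the function name is ours, not a library API, and the bin-edge handling differs slightly from `pd.cut` at exact boundaries):

def ece_numpy(p_hat, y, n_bins=10):
    """Expected Calibration Error over equal-width bins (plain-numpy sketch)."""
    edges = np.linspace(0, 1, n_bins + 1)
    bin_ids = np.clip(np.digitize(p_hat, edges) - 1, 0, n_bins - 1)
    ece = 0.0
    for b in range(n_bins):
        mask = bin_ids == b
        if mask.any():
            # mask.mean() is n_b / N
            ece += mask.mean() * np.abs(p_hat[mask].mean() - y[mask].mean())
    return ece

ece_numpy(p_hats, true_labels)  # should match the pandas ECE above (up to bin-edge conventions)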
plt.figure(figsize=(6, 6))
plt.plot([0, 1], [0, 1], 'k:')
plt.plot(mean_predicted_probs, fraction_of_positives, 'o-')
plt.xlabel('Predicted probability')
plt.ylabel('Fraction of positives')
plt.title(f'Reliability Diagram (ECE={ece:.4f})')
plt.gca().set_aspect('equal')
# Plot the bar also
plt.bar(mean_predicted_probs, fraction_of_positives, width=0.1, alpha=0.1);

from sklearn.calibration import calibration_curve
prob_true, prob_pred = calibration_curve(df['y'], df['p_hat'], n_bins=10, strategy='uniform')
assert np.allclose(prob_true, fraction_of_positives)
assert np.allclose(prob_pred, mean_predicted_probs)
pd.DataFrame({'prob_true': prob_true, 'prob_pred': prob_pred})

|   | prob_true | prob_pred |
|---|---|---|
| 0 | 0.006231 | 0.030912 |
| 1 | 0.241379 | 0.149753 |
| 2 | 0.269231 | 0.244433 |
| 3 | 0.375000 | 0.341098 |
| 4 | 0.620690 | 0.447832 |
| 5 | 0.413793 | 0.550410 |
| 6 | 0.529412 | 0.650223 |
| 7 | 0.722222 | 0.753892 |
| 8 | 0.849315 | 0.854304 |
| 9 | 0.987097 | 0.970665 |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
def plot_reliability_diagram(classifier, X, y, n_bins=10, ax=None):
    """
    Generate a reliability diagram and calculate ECE for a classifier.

    Parameters:
    - classifier: A scikit-learn classifier with a `predict_proba` method.
    - X: Input features for the dataset.
    - y: True labels for the dataset.
    - n_bins: Number of bins for the reliability diagram.
    - ax: Matplotlib axes object to plot on.

    Returns:
    - ece: Expected Calibration Error (ECE).
    """
    if ax is None:
        fig, ax = plt.subplots()
    # Fit the classifier
    classifier.fit(X, y)
    # Predict probabilities for the positive class
    p_hats = classifier.predict_proba(X)[:, 1]
    # Create a DataFrame
    df = pd.DataFrame({'p_hat': p_hats, 'y': y})
    # Create equal-width bins based on predicted probabilities
    bins = np.linspace(0, 1, n_bins + 1)
    df['bin'] = pd.cut(df['p_hat'], bins=bins)
    mean_predicted_probs = df.groupby('bin').p_hat.mean()
    fraction_of_positives = df.groupby('bin').y.mean()
    # Empty bins produce NaNs; fill with 0 (their bin size is 0, so they do not affect ECE)
    mean_predicted_probs = mean_predicted_probs.fillna(0)
    fraction_of_positives = fraction_of_positives.fillna(0)
    # Calculate the Expected Calibration Error (ECE) weighted by bin sizes
    bin_sizes = df.groupby('bin').size()
    ece = (np.abs(mean_predicted_probs - fraction_of_positives) * bin_sizes / bin_sizes.sum()).sum()
    # Create the reliability diagram plot
    ax.plot([0, 1], [0, 1], 'k:')
    ax.scatter(mean_predicted_probs, fraction_of_positives)
    ax.bar(mean_predicted_probs, fraction_of_positives, width=0.1, alpha=0.1)
    ax.set_xlabel('Mean Predicted Probability')
    ax.set_ylabel('Fraction of Positives')
    ax.set_title(f'Calibration Curve (ECE={ece:.4f})')
    return ece

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibrationDisplay
# Generate the dataset
X, y = make_moons(n_samples=1000, noise=0.1, random_state=0)
# List of classifiers to evaluate
classifiers = [
('Logistic Regression', LogisticRegression()),
('Random Forest', RandomForestClassifier(n_estimators=100, random_state=0)),
('Support Vector Classifier', SVC(probability=True)),
('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=3))
]
fig, axarr = plt.subplots(figsize=(6, 6), nrows=len(classifiers), ncols=2)
# Loop through the classifiers and generate reliability diagrams
for i, (name, classifier) in enumerate(classifiers):
    ece = plot_reliability_diagram(classifier, X, y, ax=axarr[i, 0])
    print(f"Classifier: {name}")
    print(f"ECE: {ece:.4f}")
    CalibrationDisplay.from_estimator(
        classifier,
        X,
        y,
        n_bins=10,
        ax=axarr[i, 1],
        name=name
    )
    print("*" * 20)
    print("*" * 20)
    print("\n" + "=" * 50 + "\n")

Classifier: Logistic Regression
ECE: 0.0361
********************
********************
==================================================
Classifier: Random Forest
ECE: 0.0135
********************
********************
==================================================
Classifier: Support Vector Classifier
ECE: 0.0047
********************
********************
==================================================
Classifier: K-Nearest Neighbors
ECE: 0.0013
********************
********************
==================================================

Multiclass reliability diagram
Slide 26 from https://classifier-calibration.github.io/assets/slides/clacal_tutorial_ecmlpkdd_2020_intro.pdf
# Now, multi-class classification and reliability diagrams
from sklearn.datasets import make_classification, make_blobs
X, y = make_blobs(n_samples=1000, centers=3, n_features=2, random_state=0)
# Plot the dataset
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis');

clf = LogisticRegression()
clf.fit(X, y)
preds = pd.DataFrame(clf.predict_proba(X))
preds

|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | 0.068741 | 0.005454 | 0.925805 |
| 1 | 0.983384 | 0.015859 | 0.000757 |
| 2 | 0.021072 | 0.000240 | 0.978688 |
| 3 | 0.042703 | 0.001372 | 0.955925 |
| 4 | 0.000208 | 0.000026 | 0.999766 |
| ... | ... | ... | ... |
| 995 | 0.005001 | 0.994489 | 0.000510 |
| 996 | 0.050768 | 0.015765 | 0.933467 |
| 997 | 0.006151 | 0.911409 | 0.082440 |
| 998 | 0.000271 | 0.000021 | 0.999708 |
| 999 | 0.021539 | 0.962650 | 0.015811 |
1000 rows × 3 columns
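Each row of `predict_proba` is a probability distribution over the three classes, so the rows should sum to 1; a quick sanity check:

assert np.allclose(preds[[0, 1, 2]].sum(axis=1), 1.0)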
preds['y'] = y
preds.head(10)

|   | 0 | 1 | 2 | y |
|---|---|---|---|---|
| 0 | 0.068741 | 0.005454 | 0.925805 | 2 |
| 1 | 0.983384 | 0.015859 | 0.000757 | 0 |
| 2 | 0.021072 | 0.000240 | 0.978688 | 2 |
| 3 | 0.042703 | 0.001372 | 0.955925 | 2 |
| 4 | 0.000208 | 0.000026 | 0.999766 | 2 |
| 5 | 0.179771 | 0.797045 | 0.023183 | 1 |
| 6 | 0.120874 | 0.003821 | 0.875305 | 0 |
| 7 | 0.957242 | 0.000002 | 0.042756 | 0 |
| 8 | 0.000908 | 0.998876 | 0.000216 | 1 |
| 9 | 0.836772 | 0.162509 | 0.000718 | 0 |
# Find the most probable class: the argmax over the three probability columns
most_probable_class = preds[[0, 1, 2]].idxmax(axis=1)
# Add the most probable class to the DataFrame
preds['y_hat'] = most_probable_class
preds.head()

|   | 0 | 1 | 2 | y | y_hat |
|---|---|---|---|---|---|
| 0 | 0.068741 | 0.005454 | 0.925805 | 2 | 2 |
| 1 | 0.983384 | 0.015859 | 0.000757 | 0 | 0 |
| 2 | 0.021072 | 0.000240 | 0.978688 | 2 | 2 |
| 3 | 0.042703 | 0.001372 | 0.955925 | 2 | 2 |
| 4 | 0.000208 | 0.000026 | 0.999766 | 2 | 2 |
# Let us simplify the dataframe to keep only the most probable class's probability and whether the prediction was correct
new_df = preds.copy()
new_df["Correct"] = new_df.y == new_df.y_hat
new_df["Most_Probable_Class_Probability"] = new_df[[0, 1, 2]].max(axis=1)
new_df.head()

|   | 0 | 1 | 2 | y | y_hat | Correct | Most_Probable_Class_Probability |
|---|---|---|---|---|---|---|---|
| 0 | 0.068741 | 0.005454 | 0.925805 | 2 | 2 | True | 0.925805 |
| 1 | 0.983384 | 0.015859 | 0.000757 | 0 | 0 | True | 0.983384 |
| 2 | 0.021072 | 0.000240 | 0.978688 | 2 | 2 | True | 0.978688 |
| 3 | 0.042703 | 0.001372 | 0.955925 | 2 | 2 | True | 0.955925 |
| 4 | 0.000208 | 0.000026 | 0.999766 | 2 | 2 | True | 0.999766 |
# drop the other columns
new_df = new_df.drop(columns=[0, 1, 2, 'y', 'y_hat'])
new_df.head()

|   | Correct | Most_Probable_Class_Probability |
|---|---|---|
| 0 | True | 0.925805 |
| 1 | True | 0.983384 |
| 2 | True | 0.978688 |
| 3 | True | 0.955925 |
| 4 | True | 0.999766 |
# Let us look at samples where the most probable class probability is close to 0.5 (say between 0.45 and 0.55)
df_close_phat_val = new_df[(new_df.Most_Probable_Class_Probability > 0.45) & (new_df.Most_Probable_Class_Probability < 0.55)]
df_close_phat_val.head()

|   | Correct | Most_Probable_Class_Probability |
|---|---|---|
| 27 | True | 0.532492 |
| 127 | False | 0.537886 |
| 135 | True | 0.536489 |
| 171 | True | 0.512435 |
| 203 | False | 0.534704 |
# Accuracy among these low-confidence predictions; for a calibrated model it should be close to 0.5
df_close_phat_val.Correct.sum() / len(df_close_phat_val)
0.4838709677419355
# Now, let us compute the reliability of the classifier in each bin
bins = np.linspace(0, 1, 11)
# Cuts the data by the bins
new_df['bin'] = pd.cut(new_df.Most_Probable_Class_Probability, bins=bins)
new_df.head(10)

|   | Correct | Most_Probable_Class_Probability | bin |
|---|---|---|---|
| 0 | True | 0.925805 | (0.9, 1.0] |
| 1 | True | 0.983384 | (0.9, 1.0] |
| 2 | True | 0.978688 | (0.9, 1.0] |
| 3 | True | 0.955925 | (0.9, 1.0] |
| 4 | True | 0.999766 | (0.9, 1.0] |
| 5 | True | 0.797045 | (0.7, 0.8] |
| 6 | False | 0.875305 | (0.8, 0.9] |
| 7 | True | 0.957242 | (0.9, 1.0] |
| 8 | True | 0.998876 | (0.9, 1.0] |
| 9 | True | 0.836772 | (0.8, 0.9] |
# Calculate the reliability diagram points
mean_predicted_probs = new_df.groupby('bin').Most_Probable_Class_Probability.mean()
fraction_of_positives = new_df.groupby('bin').Correct.mean()
bin_sizes = new_df.groupby('bin').size()
ece = (np.abs(mean_predicted_probs - fraction_of_positives) * bin_sizes / len(new_df)).sum()
ece
0.016551445751129316
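Note that for multiclass problems this is top-label (confidence) calibration: we bin by the maximum predicted probability and compare it with accuracy rather than with a fraction of positives. If the plain-numpy `ece_numpy` sketch from the binary section is in scope, it applies unchanged to confidence/correctness pairs:

# Confidence-vs-accuracy ECE, reusing the hypothetical ece_numpy sketch from earlier
ece_numpy(new_df.Most_Probable_Class_Probability.values,
          new_df.Correct.values.astype(float))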
# Plot the reliability diagram
plt.figure(figsize=(6, 6))
plt.plot([0, 1], [0, 1], 'k:')
plt.plot(mean_predicted_probs, fraction_of_positives, 'o-')
plt.xlabel('Predicted probability')
plt.ylabel('Fraction of positives')
plt.title(f'Reliability Diagram (ECE={ece:.4f})');

# Histogram of p_hat values
new_df.Most_Probable_Class_Probability.hist(bins=50)
plt.xlim(0, 1);

Reliability Diagrams for regression
https://proceedings.mlr.press/v80/kuleshov18a/kuleshov18a.pdf
# Now let us look at the above for regression with (homoscedastic) aleatoric noise
X = np.linspace(-3, 3, 1000).reshape(-1, 1)
y = np.sin(X.flatten()) + np.random.normal(0, 0.1, X.shape[0])
plt.scatter(X, y, s=10, alpha=0.5);

# Use BayesianRidge for regression
from sklearn.linear_model import BayesianRidge
reg = BayesianRidge()
reg.fit(X, y)
# Predict on linspace
X_test = np.linspace(-3, 3, 100).reshape(-1, 1)
y_pred, y_std = reg.predict(X_test, return_std=True)
# Plot the predictions
plt.plot(X_test, y_pred)
plt.fill_between(X_test.flatten(), y_pred - y_std, y_pred + y_std, alpha=0.5)
plt.scatter(X, y, alpha=0.1);

# Within 1 sigma we would like to see ~68% of the data points
# Within 2 sigma we would like to see ~95% of the data points (more precisely 95.45%; 95% corresponds to k=1.96)
# Let us see how many points are within k*sigma of the mean
k = 1
y_pred, y_std = reg.predict(X, return_std=True)
# Check how many points are within k*sigma of the mean
within_k_sigma = np.abs(y - y_pred) < k * y_std
# Fraction of points within k*sigma of the mean
within_k_sigma.mean()
0.585
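If the predictive distribution were a well-calibrated Gaussian, about 68.3% of the points should fall within $1\sigma$, noticeably more than the 0.585 observed (the linear BayesianRidge model misfits the sinusoid, so its residuals are structured rather than Gaussian). The ideal coverage for any $k$ comes directly from the normal CDF:

from scipy.stats import norm
2 * norm.cdf(1) - 1  # ~0.6827, the ideal fraction within 1 sigma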
def within_k_sigma(y, y_pred, y_std, k=1):
    """
    Return a boolean mask of the points within k*sigma of the predicted mean.
    """
    return np.abs(y - y_pred) < k * y_std

# Plot the fraction within k*sigma for k = 1, 2, 3, ...
ks = np.arange(1, 6)
within_k_sigma_fractions = [within_k_sigma(y, y_pred, y_std, k=k).mean() for k in ks]
plt.plot(ks, within_k_sigma_fractions, 'o-')
plt.xlabel('k')
plt.ylabel('Fraction of points within k*sigma of the mean');

# Ideally we want ~95% of the points within the 95% CI, i.e. within k=1.96 sigma of the mean
# We need a function to map a CI level to k
from scipy.stats import norm
def ci_to_k(ci):
    """
    Return the value of k corresponding to the given confidence interval.
    """
    return norm.ppf((1 + ci) / 2)

ci_to_k(0.95)
1.959963984540054
ci_to_k(0.68)
0.9944578832097535
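A quick round-trip check: the central mass of a standard normal between $-k$ and $k$ should recover the requested confidence level.

k95 = ci_to_k(0.95)
assert np.isclose(norm.cdf(k95) - norm.cdf(-k95), 0.95)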
# make reliability diagram for regression
percentiles = np.linspace(0, 100, 11)
quantiles = percentiles / 100
ks = ci_to_k(quantiles)
# Get fraction of points within k*sigma of the mean for each k
within_k_sigma_fractions = [within_k_sigma(y, y_pred, y_std, k=k).mean() for k in ks]
plt.plot(quantiles, within_k_sigma_fractions, 'o-')
# Plot ideal
plt.plot([0, 1], [0, 1], 'k:')
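Kuleshov et al. also summarize such a curve with a single calibration score: a sum of (optionally weighted) squared gaps between requested and observed coverage. A minimal sketch under our symmetric-interval setup, reusing `within_k_sigma` and `ci_to_k` from above (the unweighted form and the function name are our simplifications, not the paper's exact estimator):

def regression_calibration_score(y, y_pred, y_std, levels):
    """Sum of squared gaps between nominal and empirical central-interval coverage."""
    ks = ci_to_k(levels)
    observed = np.array([within_k_sigma(y, y_pred, y_std, k=k).mean() for k in ks])
    return np.sum((levels - observed) ** 2)

regression_calibration_score(y, y_pred, y_std, quantiles)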