Coin Toss (MLE, MAP, Fully Bayesian) in TF Probability
Goals
We will study the classic coin-toss problem. I will not go into detailed derivations; instead, the focus is on automatic gradient computation and inference with TensorFlow Probability (TFP).
We have the following goals in this tutorial.
Goal 1: Maximum Likelihood Estimate (MLE)
Given a set of $N$ observations, estimate the probability of heads, denoted $\theta = p(H)$.
Goal 2: Maximum A-Posteriori (MAP)
Given a set of $N$ observations and some prior knowledge on the distribution of $\theta$, find the best point estimate of $\theta$ once we have observed the dataset.
Goal 3: Fully Bayesian
Given a set of $N$ observations and some prior knowledge on the distribution of $\theta$, estimate the full (posterior) distribution of $\theta$ once we have observed the dataset.
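In symbols, for a dataset $\mathcal{D}$ of $N$ tosses, the three goals amount to

$$\hat{\theta}_{MLE} = \arg\max_{\theta} \, p(\mathcal{D} \mid \theta), \qquad \hat{\theta}_{MAP} = \arg\max_{\theta} \, p(\mathcal{D} \mid \theta)\, p(\theta), \qquad p(\theta \mid \mathcal{D}) = \frac{p(\mathcal{D} \mid \theta)\, p(\theta)}{p(\mathcal{D})}$$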
All references are listed at the end, but I especially want to acknowledge Felix and his excellent repo and video playlists (Playlist 1, Playlist 2), which inspired me to create this post.
from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import functools
import seaborn as sns
import tensorflow_probability as tfp
import pandas as pd
tfd = tfp.distributions
tfl = tfp.layers
tfb = tfp.bijectors
sns.reset_defaults()
sns.set_context(context="talk", font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format='retina'
np.random.seed(0)
tf.random.set_seed(0)
distribution = tfd.Bernoulli(probs=0.75)
dataset_10 = distribution.sample(10)
print(dataset_10.numpy())
mle_estimate_10 = tf.reduce_mean(tf.cast(dataset_10, tf.float32))
tf.print(mle_estimate_10)
MLE
Obtaining the MLE analytically
From the above 10 samples, we obtain 6 heads (1s) and 4 tails (0s). As per the principle of maximum likelihood, the best estimate is $\theta = p(H) = \dfrac{n_h}{n_h+n_t} = 0.6$.
We may also notice that 0.6 is quite far from the 0.75 we set when generating the data. This is expected, since our dataset is small.
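For completeness, the closed form above follows from a one-line derivation: with $n_h$ heads and $n_t$ tails, the log-likelihood and its stationary point are

$$\log p(\mathcal{D} \mid \theta) = n_h \log \theta + n_t \log (1 - \theta), \qquad \frac{\partial \log p(\mathcal{D} \mid \theta)}{\partial \theta} = \frac{n_h}{\theta} - \frac{n_t}{1 - \theta} = 0 \;\Rightarrow\; \hat{\theta}_{MLE} = \frac{n_h}{n_h + n_t}$$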
We will now verify that we get the same result using TFP. But first, let us draw a graphical model for our problem.
import daft
pgm = daft.PGM([4, 3], origin=[0, 0])
pgm.add_node(daft.Node("theta", r"$\theta$", 1, 2.5, aspect=1.8))
pgm.add_node(daft.Node("obs", r"$obs_i$", 1, 1, aspect=1.2, observed=True))
pgm.add_edge("theta", "obs")
pgm.add_plate([0, 0.5, 2, 1.0], label=r"$N$", shift=-0.1)
pgm.render()
dataset_large = distribution.sample(100000)
mle_estimate = {}
for dataset_size in [10, 50, 100, 500, 1000, 10000, 100000]:
    mle_estimate[dataset_size] = tf.reduce_mean(
        tf.cast(dataset_large[:dataset_size], tf.float32)
    )
tf.print(mle_estimate)
As we can see above, as the dataset size grows, the MLE gets closer and closer to the value we used to generate the data (0.75).
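As a quick visual check (a small sketch, not part of the original analysis), we could plot these estimates against the dataset size and watch them converge towards 0.75.

# Sketch: running MLE estimate vs. dataset size (log scale on the x-axis)
sizes = sorted(mle_estimate.keys())
estimates = [mle_estimate[size].numpy() for size in sizes]
plt.semilogx(sizes, estimates, marker="o")
plt.axhline(0.75, color="gray", linestyle="--", label="True value")
plt.xlabel("Dataset size")
plt.ylabel(r"MLE of $\theta$")
plt.legend()
sns.despine()

Next, we obtain the same MLE numerically: we treat $\theta$ as a tf.Variable and minimize the negative log-likelihood with tfp.math.minimize.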
theta = tf.Variable(0.1)
fit = tfd.Bernoulli(probs=theta)
fit.log_prob(dataset_10)
dataset = dataset_10
def loss():
    return -tf.reduce_sum(fit.log_prob(dataset))
trace_fn = lambda traceable_quantities: {
"loss": traceable_quantities.loss,
"theta": theta,
}
num_steps = 150
trace = tfp.math.minimize(
loss_fn=loss,
num_steps=num_steps,
optimizer=tf.optimizers.Adam(learning_rate=0.01),
trace_fn=trace_fn,
)
theta
fig, ax = plt.subplots(nrows=2, sharex=True, figsize=(6, 4))
ax[0].plot(range(num_steps), trace["loss"])
ax[1].plot(range(num_steps), trace["theta"])
sns.despine()
ax[1].set_xlabel("Iterations")
ax[0].set_ylabel("Loss")
ax[1].set_ylabel(r"$\theta$")
fig.tight_layout()
From the above calculations, we can see that we obtain the same estimate of ~0.6 using TFP. We can do the same thing at a slightly lower level by computing the gradients ourselves with tf.GradientTape and applying them with a Keras optimizer, as shown below.
@tf.function
def loss_and_grads(fit):
    with tf.GradientTape() as tape:
        loss = -tf.reduce_sum(fit.log_prob(dataset))
    return loss, tape.gradient(loss, fit.trainable_variables)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
theta = tf.Variable(0.1)
fit = tfd.Bernoulli(probs=theta)
for i in range(num_steps):
    loss, grads = loss_and_grads(fit)
    optimizer.apply_gradients(zip(grads, fit.trainable_variables))
fit.trainable_variables
We can see that we obtain the same estimate. We now move on to MAP estimation, where we place a prior $p(\theta)$ on $\theta$ and maximize the (unnormalized) posterior, i.e., the joint probability of $\theta$ and the data, rather than the likelihood alone.
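Concretely, the quantity we maximize is the joint log-probability of the model,

$$\log p(\theta, \mathcal{D}) = \log p(\theta) + \sum_{i=1}^{N} \log p(obs_i \mid \theta), \qquad \hat{\theta}_{MAP} = \arg\max_{\theta}\, \log p(\theta, \mathcal{D}),$$

which is exactly what the log_prob method of the joint distributions defined below computes. The graphical model below adds the prior's parameters $\alpha$ and $\beta$ as parents of $\theta$.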
pgm = daft.PGM([4, 4], origin=[0, 0])
pgm.add_node(daft.Node("alpha", r"$\alpha$", 0.5, 3.5, aspect=1.8))
pgm.add_node(daft.Node("beta", r"$\beta$", 1.5, 3.5, aspect=1.8))
pgm.add_node(daft.Node("theta", r"$\theta$", 1, 2.5, aspect=2))
# pgm.add_node(daft.Node("theta", r"$\theta\sim Beta (\alpha, \beta)$", 1, 2.5, aspect=4))
pgm.add_node(daft.Node("obs", r"$obs_i$", 1, 1, aspect=1.2, observed=True))
pgm.add_edge("theta", "obs")
pgm.add_edge("alpha", "theta")
pgm.add_edge("beta", "theta")
pgm.add_plate([0, 0.5, 2, 1.0], label=r"$N$", shift=-0.1)
pgm.render()
def coin_toss_uniform_model():
    theta = yield tfp.distributions.Uniform(low=0.0, high=1.0, name="Theta")
    coin = yield tfp.distributions.Bernoulli(probs=tf.ones(100) * theta, name="Coin")
coin_toss_uniform_model
model_joint_uniform = tfp.distributions.JointDistributionCoroutineAutoBatched(
lambda: coin_toss_uniform_model(), name="Original"
)
model_joint_uniform
def uniform_model(dataset):
    num_datapoints = len(dataset)
    theta = yield tfp.distributions.Uniform(low=0.0, high=1.0, name="Theta")
    coin = yield tfp.distributions.Bernoulli(
        probs=tf.ones(num_datapoints) * theta, name="Coin"
    )
concrete_uniform_model = functools.partial(uniform_model, dataset=dataset_10)
model = tfd.JointDistributionCoroutineAutoBatched(concrete_uniform_model)
model.sample()
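As an optional sanity check (a minimal sketch; theta_check below is just an arbitrary value), the joint log-probability returned by the model is simply the prior log-density of $\theta$ plus the summed Bernoulli log-likelihood of the tosses.

# Sketch: the joint log_prob factorizes into a prior term and a likelihood term
theta_check = 0.6
joint_lp = model.log_prob((theta_check, dataset_10))
manual_lp = tfd.Uniform(low=0.0, high=1.0).log_prob(theta_check) + tf.reduce_sum(
    tfd.Bernoulli(probs=theta_check).log_prob(dataset_10)
)
tf.print(joint_lp, manual_lp)  # the two values should match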
th = tf.Variable(0.4)
target_log_prob_fn = lambda th: model.log_prob((th, dataset_10))
x_s = tf.linspace(0.0, 1.0, 1000)
y_s = -target_log_prob_fn(x_s)
plt.plot(x_s, y_s)
plt.xlabel(r"$\theta$")
plt.ylabel("- Joint Log Prob \n(Unnormalized)")
sns.despine()
trace = tfp.math.minimize(
lambda: -target_log_prob_fn(th),
optimizer=tf.optimizers.Adam(learning_rate=0.01),
# trace_fn=trace_fn,
num_steps=200,
)
th
mle_estimate_10
We see above that our MAP estimate is fairly close to the MLE when we use the uniform prior. This is expected: a uniform prior contributes a constant $\log p(\theta)$, so maximizing the joint log-probability is equivalent to maximizing the likelihood alone. Next, we repeat the exercise with an informative Beta prior.
def beta_prior_model(dataset, alpha, beta):
    num_datapoints = len(dataset)
    # Note: in TFP's Beta, concentration1 is the "alpha" and concentration0 the
    # "beta" parameter, so this prior has mean beta / (alpha + beta).
    theta = yield tfp.distributions.Beta(
        concentration0=alpha, concentration1=beta, name="Theta"
    )
    coin = yield tfp.distributions.Bernoulli(
        probs=tf.ones(num_datapoints) * theta, name="Coin"
    )
concrete_beta_prior_model_40_10 = functools.partial(
beta_prior_model, dataset=dataset_10, alpha=40, beta=10
)
model_2_40_10 = tfd.JointDistributionCoroutineAutoBatched(
concrete_beta_prior_model_40_10
)
model_2_40_10.sample()
model_2_40_10.prob(Theta=0.1, Coin=[0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
th = tf.Variable(0.2)
target_log_prob_fn = lambda th: model_2_40_10.log_prob(Theta=th, Coin=dataset_10)
x_s = tf.linspace(0.0, 1.0, 1000)
y_s = -target_log_prob_fn(x_s)
plt.plot(x_s, y_s)
plt.xlabel(r"$\theta$")
plt.ylabel("- Joint Log Prob \n(Unnormalized)")
sns.despine()
trace = tfp.math.minimize(
lambda: -target_log_prob_fn(th),
optimizer=tf.optimizers.Adam(learning_rate=0.01),
# trace_fn=trace_fn,
num_steps=200,
)
th
We now see that our MAP estimate for $\theta$ is about 0.25, which is quite far from the MLE. Choosing a better prior would have led to a better estimate. Alternatively, if we had more data, the likelihood would have dominated the prior, again resulting in a better estimate.
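For reference (a short aside, assuming the 6 heads / 4 tails split obtained earlier), the MAP estimate under a Beta prior has a closed form. Keep in mind that TFP's Beta treats concentration1 as $\alpha$ and concentration0 as $\beta$, so the prior defined in the code above corresponds to $\alpha = 10$, $\beta = 40$:

$$\hat{\theta}_{MAP} = \frac{n_h + \alpha - 1}{n_h + n_t + \alpha + \beta - 2} = \frac{6 + 10 - 1}{6 + 4 + 10 + 40 - 2} = \frac{15}{58} \approx 0.26,$$

which is consistent with the value found by the optimizer.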
concrete_beta_prior_model_1_1 = functools.partial(
beta_prior_model, dataset=dataset_10, alpha=1, beta=1
)
model_2_1_1 = tfd.JointDistributionCoroutineAutoBatched(concrete_beta_prior_model_1_1)
th = tf.Variable(0.2)
target_log_prob_fn = lambda th: model_2_1_1.log_prob(Theta=th, Coin=dataset_10)
trace = tfp.math.minimize(
lambda: -target_log_prob_fn(th),
optimizer=tf.optimizers.Adam(learning_rate=0.01),
# trace_fn=trace_fn,
num_steps=200,
)
th
Our estimate for $\theta$ is more reasonable now. Finally, we turn to the fully Bayesian treatment: instead of a single point estimate, we approximate the entire posterior distribution of $\theta$. We do this with variational inference (VI), fitting a surrogate Beta distribution $q(\theta)$ to the posterior using tfp.vi.fit_surrogate_posterior.
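Before fitting anything, it is worth noting (a short aside, using the 6 heads / 4 tails split from earlier) that for this model the exact posterior is available in closed form, because the Beta prior is conjugate to the Bernoulli likelihood:

$$p(\theta \mid \mathcal{D}) = \mathrm{Beta}(\alpha + n_h,\ \beta + n_t) = \mathrm{Beta}(1 + 6,\ 1 + 4) = \mathrm{Beta}(7, 5),$$

whose mean is $7/12 \approx 0.58$. This gives us a ground truth to compare the variational fit against.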
q_alpha = tf.Variable(1.0)
q_beta = tf.Variable(1.0)
surrogate_posterior = tfd.Beta(concentration0=q_alpha, concentration1=q_beta, name="q")
surrogate_posterior.sample()
losses = tfp.vi.fit_surrogate_posterior(
target_log_prob_fn,
surrogate_posterior=surrogate_posterior,
optimizer=tf.optimizers.Adam(learning_rate=0.005),
num_steps=400,
)
plt.plot(losses)
plt.xlabel("Iterations")
plt.ylabel("Loss")
sns.despine()
q_alpha, q_beta
sns.kdeplot(surrogate_posterior.sample(500).numpy(), bw_adjust=2)
sns.despine()
plt.xlabel(r"$\theta$")
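As an optional comparison (a sketch that relies on the conjugate Beta(7, 5) posterior derived earlier), we can overlay the exact posterior density on the samples from the fitted surrogate; the two curves should roughly agree.

# Sketch: compare the VI surrogate against the exact conjugate posterior Beta(7, 5)
# (concentration1 = 1 + 6 heads, concentration0 = 1 + 4 tails)
exact_posterior = tfd.Beta(concentration1=7.0, concentration0=5.0)
x_grid = tf.linspace(0.001, 0.999, 500)
plt.plot(x_grid, exact_posterior.prob(x_grid), label="Exact posterior")
sns.kdeplot(surrogate_posterior.sample(500).numpy(), bw_adjust=2, label="Surrogate (VI)")
plt.xlabel(r"$\theta$")
plt.legend()
sns.despine()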
model_2_1_1.sample(Theta=0.1)
model_2_1_1.sample(Theta=0.9)
model_2_1_1.sample(Coin=[0, 1, 1, 0, 1, 1, 1, 0])
model_2_1_1.sample(Coin=[0, 1, 1, 0, 1, 1, 1, 0])
As we see above, we get a different $\theta$ on each call: pinning Coin in sample() does not condition the model on those tosses, so $\theta$ is still drawn from its prior. (Had we actually conditioned on a large dataset, the posterior over $\theta$ would be tightly concentrated and such variation would be far less pronounced.)
c = model_2_1_1.sample(Theta=surrogate_posterior.sample(1000)).Coin
pd.DataFrame(c)
sns.histplot(tf.reduce_sum(tf.cast(c, tf.float32), axis=1), bins=11)
sns.despine()
The histogram above shows the number of heads (out of 10 tosses) in each of the 1,000 datasets generated using $\theta$ values drawn from the fitted posterior, i.e., samples from the posterior predictive distribution.
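As a small follow-up (a sketch, not part of the original analysis), the overall fraction of heads across all simulated tosses approximates the posterior-predictive probability of heads, which should be close to the posterior mean of $\theta$ (about 0.58 for the conjugate Beta(7, 5) posterior).

# Fraction of heads across all simulated tosses ~ posterior-predictive p(H)
tf.print(tf.reduce_mean(tf.cast(c, tf.float32)))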
References (incomplete as of now)
- Excellent repo and video playlist (Playlist 1, Playlist 2) by Felix
- Probabilistic PCA tutorial on TFP
- Discussion on joint log prob on TFP