import torch
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
sns.reset_defaults()
sns.set_context(context="talk", font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format='retina'
dist = torch.distributions
Creating a 1d normal distribution
uv_normal = dist.Normal(loc=0.0, scale=1.0)
Sampling from the distribution
samples = uv_normal.sample(sample_shape=[100])
sns.kdeplot(samples, bw_adjust=2)
sns.despine()
Defining the prior
prior_mu = torch.tensor(5.0, requires_grad=True)
prior = dist.Normal(loc=prior_mu, scale=1.0)
prior
Normal(loc: 5.0, scale: 1.0)
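To summarize the setup: the data are drawn from \(\mathcal{N}(0, 1)\), while the model we fit assumes a unit-variance likelihood \(x_i \mid \mu \sim \mathcal{N}(\mu, 1)\) with a deliberately off-centre prior \(\mu \sim \mathcal{N}(5, 1)\).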
Computing logprob of prior for a mu
def logprob_prior(mu):
    return -prior.log_prob(mu)
Computing logprob of observing data given a mu
stdev_likelihood = 1.0


def log_likelihood(mu, samples):
    to_learn = torch.distributions.Normal(loc=mu, scale=stdev_likelihood)
    return -torch.sum(to_learn.log_prob(samples))
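The negative log-likelihood sums over the observations because they are assumed i.i.d. given \(\mu\): \(-\log p(x_{1:n} \mid \mu) = -\sum_{i=1}^{n} \log \mathcal{N}(x_i \mid \mu, \sigma^{2})\).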
mu = torch.tensor(-2.0, requires_grad=True)
log_likelihood(mu, samples), logprob_prior(mu)
log_likelihood(mu, samples).item()
305.98101806640625
= {"Logprob_Prior": {}, "LogLikelihood": {}}
out for mu_s in torch.linspace(-10, 10, 100):
= mu_s.item()
t = torch.tensor(mu_s)
mu "Logprob_Prior"][t] = logprob_prior(mu).item()
out["LogLikelihood"][t] = log_likelihood(mu, samples).item() out[
/var/folders/1x/wmgn24mn1bbd2vgbqlk98tbc0000gn/T/ipykernel_73152/3102909564.py:4: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
mu = torch.tensor(mu_s)
pd.DataFrame(out).plot(subplots=True)
array([<AxesSubplot:>, <AxesSubplot:>], dtype=object)
def loss(mu):
    return log_likelihood(mu, samples) + logprob_prior(mu)
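Minimizing this loss is the same as maximizing the unnormalized log posterior, so the minimizer is the MAP estimate: \(\hat{\mu}_{MAP} = \arg\max_{\mu}\left[\log p(x_{1:n} \mid \mu) + \log p(\mu)\right] = \arg\min_{\mu} \operatorname{loss}(\mu)\).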
mu = torch.tensor(2.0, requires_grad=True)
opt = torch.optim.Adam([mu], lr=0.01)

for i in range(1500):
    loss_val = loss(mu)
    loss_val.backward()
    if i % 100 == 0:
        print(f"Iteration: {i}, Loss: {loss_val.item():0.2f}, Loc: {mu.item():0.6f}")
    opt.step()
    opt.zero_grad()
Iteration: 0, Loss: 374.37, Loc: 2.000000
Iteration: 100, Loss: 222.93, Loc: 1.092788
Iteration: 200, Loss: 166.98, Loc: 0.468122
Iteration: 300, Loss: 152.88, Loc: 0.119012
Iteration: 400, Loss: 150.57, Loc: -0.034995
Iteration: 500, Loss: 150.33, Loc: -0.088207
Iteration: 600, Loss: 150.31, Loc: -0.102667
Iteration: 700, Loss: 150.31, Loc: -0.105761
Iteration: 800, Loss: 150.31, Loc: -0.106279
Iteration: 900, Loss: 150.31, Loc: -0.106346
Iteration: 1000, Loss: 150.31, Loc: -0.106352
Iteration: 1100, Loss: 150.31, Loc: -0.106353
Iteration: 1200, Loss: 150.31, Loc: -0.106353
Iteration: 1300, Loss: 150.31, Loc: -0.106353
Iteration: 1400, Loss: 150.31, Loc: -0.106353
Analytical MAP estimate of location
\(\hat{\theta}_{MAP}=\dfrac{n}{n+\sigma^{2}} \bar{x}+\dfrac{\sigma^{2}}{n+\sigma^{2}} \mu\)
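This is the posterior mean for a Gaussian likelihood with known variance \(\sigma^{2}\) and a \(\mathcal{N}(\mu, \tau^{2})\) prior, specialized to the unit prior variance \(\tau^{2}=1\) used above; in general \(\hat{\theta}_{MAP}=\dfrac{n\tau^{2}}{n\tau^{2}+\sigma^{2}}\bar{x}+\dfrac{\sigma^{2}}{n\tau^{2}+\sigma^{2}}\mu\).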
prior_mu
tensor(5., requires_grad=True)
n = samples.shape[0]
sample_mean = samples.mean()
n_plus_variance = n + stdev_likelihood**2

loc_map = ((n * sample_mean) / n_plus_variance) + (
    (stdev_likelihood**2) / (n_plus_variance)
) * prior_mu
loc_map.item()
-0.1063527911901474
torch.allclose(loc_map, mu)
True
Setting 2: Learning location and scale
An important difference from the previous code is that we need to use a transformed variable to ensure the scale is positive. We do so by using softplus.
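Softplus maps any real number to a positive one, \(\operatorname{softplus}(x)=\log(1+e^{x})>0\), and behaves like the identity for large \(x\) (e.g. \(\operatorname{softplus}(2)\approx 2.13\), which is the initial scale printed below).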
mu = torch.tensor(1.0, requires_grad=True)
scale = torch.tensor(2.0, requires_grad=True)


def log_likelihood(mu, scale, samples):
    scale_softplus = torch.functional.F.softplus(scale)
    to_learn = torch.distributions.Normal(loc=mu, scale=scale_softplus)
    return -torch.sum(to_learn.log_prob(samples))
def loss(mu, scale):
    return log_likelihood(mu, scale, samples) + logprob_prior(mu)


opt = torch.optim.Adam([mu, scale], lr=0.01)

for i in range(1500):
    loss_val = loss(mu, scale)
    loss_val.backward()
    if i % 100 == 0:
        print(
            f"Iteration: {i}, Loss: {loss_val.item():0.2f}, Loc: {mu.item():0.3f}, Scale: {torch.functional.F.softplus(scale).item():0.3f}"
        )
    opt.step()
    opt.zero_grad()
Iteration: 0, Loss: 200.89, Loc: 1.000, Scale: 2.127
Iteration: 100, Loss: 158.51, Loc: 0.086, Scale: 1.282
Iteration: 200, Loss: 149.98, Loc: -0.112, Scale: 0.942
Iteration: 300, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 400, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 500, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 600, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 700, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 800, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 900, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 1000, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 1100, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 1200, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 1300, Loss: 149.98, Loc: -0.112, Scale: 0.943
Iteration: 1400, Loss: 149.98, Loc: -0.112, Scale: 0.943
We can see that the parameters found by our gradient-based method match those of the analytically computed MLE.
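For a quick comparison (a small check sketched here, not in the original code): the closed-form MLE is the sample mean and the biased sample standard deviation; the learnt location sits slightly toward the prior mean of 5 relative to the sample mean, since the prior still carries some weight against 100 observations.

# Closed-form MLE of a Gaussian's parameters, for comparison with the learnt values
# (illustrative sketch; exact numbers depend on the sampled data)
mle_loc = samples.mean()
mle_scale = torch.sqrt(torch.mean((samples - mle_loc) ** 2))  # biased (1/n) estimate
mle_loc.item(), mle_scale.item()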
Learning the parameters of a multivariate normal
mvn = dist.MultivariateNormal(
    loc=torch.tensor([1.0, 1.0]),
    covariance_matrix=torch.tensor([[2.0, 0.5], [0.5, 0.4]]),
)
mvn_samples = mvn.sample([1000])
loc = torch.tensor([-1.0, 1.0], requires_grad=True)
tril = torch.autograd.Variable(torch.tril(torch.ones(2, 2)), requires_grad=True)
opt = torch.optim.Adam([loc, tril], lr=0.01)

prior = dist.MultivariateNormal(
    loc=torch.tensor([0.0, 0.0]),
    covariance_matrix=torch.tensor([[1.0, 0.0], [0.0, 1.0]]),
)
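Rather than optimizing the covariance matrix directly, we learn an unconstrained lower-triangular factor and form \(\Sigma = L L^{\top}\). This product is symmetric and positive semidefinite for any \(L\) (positive definite when the diagonal of \(L\) is non-zero), so the optimizer can never step to an invalid covariance.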
def log_likelihood(loc, tril, samples):
    cov = tril @ tril.t()
    to_learn = torch.distributions.MultivariateNormal(loc=loc, covariance_matrix=cov)
    return -torch.sum(to_learn.log_prob(samples))


def logprob_prior(loc):
    return -prior.log_prob(loc)


def loss(loc, tril, samples):
    return log_likelihood(loc, tril, samples) + logprob_prior(loc)
for i in range(8100):
    to_learn = dist.MultivariateNormal(loc=loc, covariance_matrix=tril @ tril.t())
    loss_value = loss(loc, tril, mvn_samples)
    loss_value.backward()
    if i % 500 == 0:
        print(f"Iteration: {i}, Loss: {loss_value.item():0.2f}, Loc: {loc}")
    opt.step()
    opt.zero_grad()
Iteration: 0, Loss: 7663.86, Loc: tensor([-1., 1.], requires_grad=True)
Iteration: 500, Loss: 2540.96, Loc: tensor([0.8229, 0.9577], requires_grad=True)
Iteration: 1000, Loss: 2526.40, Loc: tensor([1.0300, 1.0076], requires_grad=True)
Iteration: 1500, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 2000, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 2500, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 3000, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 3500, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 4000, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 4500, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 5000, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 5500, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 6000, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 6500, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 7000, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 7500, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
Iteration: 8000, Loss: 2526.40, Loc: tensor([1.0308, 1.0077], requires_grad=True)
tril @ tril.t(), mvn.covariance_matrix, prior.covariance_matrix
(tensor([[1.9699, 0.4505],
[0.4505, 0.3737]], grad_fn=<MmBackward0>),
tensor([[2.0000, 0.5000],
[0.5000, 0.4000]]),
tensor([[1., 0.],
[0., 1.]]))
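Since the prior here is only on the location, the learnt covariance is essentially the maximum-likelihood estimate. A quick comparison against the closed-form MLE (a sketch added for illustration; numbers depend on the draw of mvn_samples):

# Closed-form MLE of the multivariate normal: sample mean and (biased) sample covariance
mle_loc = mvn_samples.mean(dim=0)
centered = mvn_samples - mle_loc
mle_cov = centered.t() @ centered / mvn_samples.shape[0]
mle_loc, mle_cov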
Todo
1. Expand on MVN case
2. Clean up code
3. Visualize prior, likelihood, MLE, MAP
4. Shrinkage estimation (reference Murphy book)
5. Inverse Wishart distribution
References
- https://stats.stackexchange.com/questions/351549/maximum-likelihood-estimators-multivariate-gaussian
- https://forum.pyro.ai/t/mle-for-normal-distribution-parameters/3861/3
- https://ericmjl.github.io/notes/stats-ml/estimating-a-multivariate-gaussians-parameters-by-gradient-descent/
- https://www.youtube.com/watch?v=KogqeZ_88-g&list=PLD0F06AA0D2E8FFBA&index=32