import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
# Retina display
%config InlineBackend.figure_format = 'retina'
from jax import random
import warnings
'ignore')
warnings.filterwarnings(
'figure.constrained_layout.use'] = True plt.rcParams[
Vanilla Linear Regression
import seaborn as sns
"notebook") sns.set_context(
= "https://gist.githubusercontent.com/ucals/" + "2cf9d101992cb1b78c2cdd6e3bac6a4b/raw/"+ "43034c39052dcf97d4b894d2ec1bc3f90f3623d9/"+ "osic_pulmonary_fibrosis.csv" URL
= pd.read_csv(URL)
train train.head()
Patient | Weeks | FVC | Percent | Age | Sex | SmokingStatus | |
---|---|---|---|---|---|---|---|
0 | ID00007637202177411956430 | -4 | 2315 | 58.253649 | 79 | Male | Ex-smoker |
1 | ID00007637202177411956430 | 5 | 2214 | 55.712129 | 79 | Male | Ex-smoker |
2 | ID00007637202177411956430 | 7 | 2061 | 51.862104 | 79 | Male | Ex-smoker |
3 | ID00007637202177411956430 | 9 | 2144 | 53.950679 | 79 | Male | Ex-smoker |
4 | ID00007637202177411956430 | 11 | 2069 | 52.063412 | 79 | Male | Ex-smoker |
train.describe()
Weeks | FVC | Percent | Age | |
---|---|---|---|---|
count | 1549.000000 | 1549.000000 | 1549.000000 | 1549.000000 |
mean | 31.861846 | 2690.479019 | 77.672654 | 67.188509 |
std | 23.247550 | 832.770959 | 19.823261 | 7.057395 |
min | -5.000000 | 827.000000 | 28.877577 | 49.000000 |
25% | 12.000000 | 2109.000000 | 62.832700 | 63.000000 |
50% | 28.000000 | 2641.000000 | 75.676937 | 68.000000 |
75% | 47.000000 | 3171.000000 | 88.621065 | 72.000000 |
max | 133.000000 | 6399.000000 | 153.145378 | 88.000000 |
# Number of unique patients
'Patient'].nunique() train[
176
#Number of records per patient
'Patient'].value_counts().reset_index()['Patient'].value_counts().sort_index().plot(kind='bar')
train["Number of weeks")
plt.xlabel("Number of Patients") plt.ylabel(
Text(0, 0.5, 'Number of Patients')
def chart_patient(patient_id, ax):
= train[train["Patient"] == patient_id]
data = data["Weeks"]
x = data["FVC"]
y =8)
ax.set_title(patient_id, fontsize=x, y=y, ax=ax, ci=None, line_kws={"color": "black", "lw":2})
sns.regplot(x
= plt.subplots(1, 3, sharey=True)
f, axes "ID00007637202177411956430", axes[0])
chart_patient("ID00009637202177434476278", axes[1])
chart_patient("ID00010637202177584971671", axes[2])
chart_patient( f.tight_layout()
try:
import numpyro
except ImportError:
%pip install numpyro
import numpyro
import numpyro.distributions as dist
= train["Weeks"].values
sample_weeks = train["FVC"].values sample_fvc
### Linear regression from scikit-learn
from sklearn.linear_model import LinearRegression
= LinearRegression()
lr -1, 1), sample_fvc) lr.fit(sample_weeks.reshape(
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
= np.arange(-12, 134, 1) all_weeks
# Plot the data and the regression line
=0.3)
plt.scatter(sample_weeks, sample_fvc, alpha-1, 1)), color="black", lw=2)
plt.plot(all_weeks, lr.predict(all_weeks.reshape("Weeks")
plt.xlabel("FVC") plt.ylabel(
Text(0, 0.5, 'FVC')
lr.coef_, lr.intercept_
(array([-1.48471319]), 2737.784722381955)
# Finding the mean absolute error
from sklearn.metrics import mean_absolute_error
= {}
maes "LinearRegression"] = mean_absolute_error(sample_fvc, lr.predict(sample_weeks.reshape(-1, 1)))
maes[ maes
{'LinearRegression': 654.8103093180237}
Pooled model
\(\alpha \sim \text{Normal}(0, 500)\)
\(\beta \sim \text{Normal}(0, 500)\)
\(\sigma \sim \text{HalfNormal}(100)\)
for i in range(N_Weeks):
\(FVC_i \sim \text{Normal}(\alpha + \beta \cdot Week_i, \sigma)\)
def pooled_model(sample_weeks, sample_fvc=None):
= numpyro.sample("α", dist.Normal(0., 500.))
α = numpyro.sample("β", dist.Normal(0., 500.))
β = numpyro.sample("σ", dist.HalfNormal(50.))
σ with numpyro.plate("samples", len(sample_weeks)):
= numpyro.sample("fvc", dist.Normal(α + β * sample_weeks, σ), obs=sample_fvc)
fvc return fvc
sample_weeks.shape
(1549,)
# Render the model graph
={"sample_weeks": sample_weeks, "sample_fvc": sample_fvc},
numpyro.render_model(pooled_model, model_kwargs=True,
render_distributions=True,
render_params )
from sklearn.preprocessing import LabelEncoder
= LabelEncoder()
patient_encoder "patient_code"] = patient_encoder.fit_transform(train["Patient"].values) train[
= train["patient_code"].values sample_patient_code
sample_patient_code
array([ 0, 0, 0, ..., 175, 175, 175])
from numpyro.infer import MCMC, NUTS, Predictive
= NUTS(pooled_model)
nuts_kernel
= MCMC(nuts_kernel, num_samples=4000, num_warmup=2000)
mcmc = random.PRNGKey(0) rng_key
WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
=sample_weeks, sample_fvc=sample_fvc)
mcmc.run(rng_key, sample_weeks= mcmc.get_samples() posterior_samples
sample: 100%|██████████| 6000/6000 [00:05<00:00, 1137.39it/s, 7 steps of size 5.24e-01. acc. prob=0.88]
import arviz as az
= az.from_numpyro(mcmc)
idata =True); az.plot_trace(idata, compact
# Summary statistics
=2) az.summary(idata, round_to
Shape validation failed: input_shape: (1, 4000), minimum_shape: (chains=2, draws=4)
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
α | 2724.98 | 33.46 | 2663.52 | 2786.90 | 0.74 | 0.52 | 2052.67 | 1952.52 | NaN |
β | -1.22 | 0.85 | -2.80 | 0.43 | 0.02 | 0.01 | 1955.95 | 2019.17 | NaN |
σ | 775.05 | 12.03 | 752.85 | 797.95 | 0.25 | 0.17 | 2411.55 | 2720.36 | NaN |
# Predictive distribution
= Predictive(pooled_model, mcmc.get_samples()) predictive
predictive
<numpyro.infer.util.Predictive at 0x7d2774b0b0a0>
= predictive(rng_key, all_weeks, None) predictions
"fvc"]).mean().plot() pd.DataFrame(predictions[
<Axes: >
"fvc"].mean(axis=0))
plt.plot(all_weeks, predictions[=0.1) plt.scatter(sample_weeks, sample_fvc, alpha
<matplotlib.collections.PathCollection at 0x7d2774851d50>
# Get the mean and standard deviation of the predictions
= predictions["fvc"].mean(axis=0)
mu = predictions["fvc"].std(axis=0)
sigma
# Plot the predictions
plt.plot(all_weeks, mu)- 1.96*sigma, mu + 1.96*sigma, alpha=0.2)
plt.fill_between(all_weeks, mu =0.2)
plt.scatter(sample_weeks, sample_fvc, alpha"Weeks")
plt.xlabel("FVC") plt.ylabel(
Text(0, 0.5, 'FVC')
= predictive(rng_key, sample_weeks, None)['fvc']
preds_pooled = preds_pooled.mean(axis=0)
predictions_train_pooled = preds_pooled.std(axis=0) std_train_pooled
### Computing Mean Absolute Error and Coverage at 95% confidence interval
"PooledModel"] = mean_absolute_error(sample_fvc, predictions_train_pooled)
maes[ maes
{'LinearRegression': 654.8103093180237, 'PooledModel': 654.6283571306085}
### Computing the coverage at 95% confidence interval
def coverage(y_true, y_pred, sigma):
= y_pred - 1.96 * sigma
lower = y_pred + 1.96 * sigma
upper return np.mean((y_true >= lower) & (y_true <= upper))
= {}
coverages "pooled"] = coverage(sample_fvc, predictions_train_pooled, std_train_pooled).item()
coverages[ coverages
{'pooled': 0.9399612545967102}
The above numbers are comparable, which is a good sign. Let’s see if we can do better with a personalised model.
Unpooled Model
\(\sigma \sim \text{HalfNormal}(20)\)
for p in range(N_patients):
\(\alpha_p \sim \text{Normal}(0, 500)\)
\(\beta_p \sim \text{Normal}(0, 500)\)
for PAT in range(N_patients):
for i in range(N_Weeks):
\(FVC_i \sim \text{Normal}(\alpha_{p}[PAT] + \beta_{p}[PAT] \cdot Week_i, \sigma)\)
# Unpooled model
def unpool_model(sample_weeks, sample_patient_code, sample_fvc=None):
= numpyro.sample("σ", dist.HalfNormal(20.))
σ with numpyro.plate("patients", sample_patient_code.max() + 1):
= numpyro.sample("α_p", dist.Normal(0, 500.))
α_p = numpyro.sample("β_p", dist.Normal(0, 500.))
β_p with numpyro.plate("samples", len(sample_weeks)):
= numpyro.sample("fvc", numpyro.distributions.Normal(α_p[sample_patient_code] +
fvc * sample_weeks, σ),
β_p[sample_patient_code] =sample_fvc)
obsreturn fvc
# Render the model graph
= {"sample_weeks": sample_weeks,
model_kwargs "sample_patient_code": sample_patient_code,
"sample_fvc": sample_fvc}
=model_kwargs,
numpyro.render_model(unpool_model, model_kwargs=True,
render_distributions=True,
render_params )
= NUTS(unpool_model)
nuts_kernel_unpooled
= MCMC(nuts_kernel_unpooled, num_samples=200, num_warmup=500)
mcmc_unpooled = random.PRNGKey(0) rng_key
model_kwargs
{'sample_weeks': array([-4, 5, 7, ..., 31, 43, 59]),
'sample_patient_code': array([ 0, 0, 0, ..., 175, 175, 175]),
'sample_fvc': array([2315, 2214, 2061, ..., 2908, 2975, 2774])}
**model_kwargs)
mcmc_unpooled.run(rng_key,
= mcmc_unpooled.get_samples() posterior_samples
sample: 100%|██████████| 700/700 [00:07<00:00, 90.53it/s, 63 steps of size 9.00e-02. acc. prob=0.91]
=True); az.plot_trace(az.from_numpyro(mcmc_unpooled), compact
# Predictive distribution for unpooled model
= Predictive(unpool_model, mcmc_unpooled.get_samples()) predictive_unpooled
# Predictive distribution for unpooled model for all weeks for a given patient
= np.arange(-12, 134, 1)
all_weeks def predict_unpooled(patient_code):
= predictive_unpooled(rng_key, all_weeks, patient_code)
predictions = predictions["fvc"].mean(axis=0)
mu = predictions["fvc"].std(axis=0)
sigma return mu, sigma
# Plot the predictions for a given patient
def plot_patient(patient_code):
= predict_unpooled(patient_code)
mu, sigma
plt.plot(all_weeks, mu)- 1.96*sigma, mu + 1.96*sigma, alpha=0.6)
plt.fill_between(all_weeks, mu = patient_encoder.inverse_transform(patient_code)[0]
id_to_patient = train[train["Patient"] == id_to_patient]["Weeks"]
patient_weeks = train[train["Patient"] == id_to_patient]["FVC"]
patient_fvc =0.5)
plt.scatter(patient_weeks, patient_fvc, alpha"Weeks")
plt.xlabel("FVC")
plt.ylabel( plt.title(id_to_patient)
= False plot_pooled
def plot_total(patient_id = 0, plot_pooled = False):
plot_patient(np.array([patient_id]))if plot_pooled:
='g')
plt.plot(all_weeks, mu, color- 1.96*sigma, mu + 1.96*sigma, alpha=0.05, color='g')
plt.fill_between(all_weeks, mu
0, True) plot_total(
0, True) plot_total(
1, True) plot_total(
3) plot_total(
Computing metrics on the training set for unpool_model
= predictive_unpooled(rng_key,
predictions_train_unpooled
sample_weeks,"patient_code"].values)['fvc']
train[ predictions_train_unpooled.shape
(200, 1549)
= predictions_train_unpooled.mean(axis=0)
mu_predictions_train_unpooled = predictions_train_unpooled.std(axis=0)
std_predictions_train_unpooled
"UnpooledModel"] = mean_absolute_error(sample_fvc, mu_predictions_train_unpooled)
maes[ maes
{'LinearRegression': 654.8103093180237,
'PooledModel': 654.6283571306085,
'UnpooledModel': 98.18583518632232}
"unpooled"] = coverage(sample_fvc, mu_predictions_train_unpooled, std_predictions_train_unpooled).item()
coverages[ coverages
{'pooled': 0.9399612545967102, 'unpooled': 0.974822461605072}
Exercise: Another variation of unpooled model
# Unpooled model
def unpool_model_var(sample_weeks, sample_patient_code, sample_fvc=None):
with numpyro.plate("patients", sample_patient_code.max() + 1):
= numpyro.sample("α_p", dist.Normal(0, 500.))
α_p = numpyro.sample("β_p", dist.Normal(0, 500.))
β_p = numpyro.sample("σ_p", dist.HalfNormal(20.))
σ_p with numpyro.plate("samples", len(sample_weeks)):
= numpyro.sample("fvc", numpyro.distributions.Normal(α_p[sample_patient_code] +
fvc * sample_weeks,
β_p[sample_patient_code]
σ_p[sample_patient_code]),=sample_fvc)
obsreturn fvc
# Plot graphical model
=model_kwargs,
numpyro.render_model(unpool_model_var, model_kwargs=True,
render_distributions=True,
render_params )
= NUTS(unpool_model_var)
nuts_kernel_unpooled_2
= MCMC(nuts_kernel_unpooled_2, num_samples=400, num_warmup=500)
mcmc_unpooled_2 = random.PRNGKey(0)
rng_key
**model_kwargs)
mcmc_unpooled_2.run(rng_key,
= mcmc_unpooled_2.get_samples() posterior_samples_2
sample: 100%|██████████| 900/900 [00:08<00:00, 102.48it/s, 63 steps of size 8.17e-02. acc. prob=0.88]
=True); az.plot_trace(az.from_numpyro(mcmc_unpooled_2), compact
# Predictive distribution for unpooled model variation 2
= Predictive(unpool_model_var, mcmc_unpooled_2.get_samples()) predictive_unpooled_2
= predictive_unpooled_2(rng_key,
predictions_train_unpooled_2
sample_weeks,"patient_code"].values)['fvc']
train[
= predictions_train_unpooled_2.mean(axis=0)
mu_predictions_train_unpooled_2 = predictions_train_unpooled_2.std(axis=0)
std_predictions_train_unpooled_2
"UnpooledModel2"] = mean_absolute_error(sample_fvc, mu_predictions_train_unpooled_2)
maes[
"unpooled2"] = coverage(sample_fvc, mu_predictions_train_unpooled_2, std_predictions_train_unpooled_2).item()
coverages[
print(maes)
print(coverages)
{'LinearRegression': 654.8103093180237, 'PooledModel': 654.6283571306085, 'UnpooledModel': 98.18583518632232, 'UnpooledModel2': 81.93005607511398}
{'pooled': 0.9399612545967102, 'unpooled': 0.974822461605072, 'unpooled2': 0.886378288269043}
Hierarchical model
\(\sigma \sim \text{HalfNormal}(100)\)
\(\mu_{\alpha} \sim \text{Normal}(0, 500)\)
\(\sigma_{\alpha} \sim \text{HalfNormal}(100)\)
\(\mu_{\beta} \sim \text{Normal}(0, 500)\)
\(\sigma_{\beta} \sim \text{HalfNormal}(100)\)
for p in range(N_patients):
\(\alpha_p \sim \text{Normal}(\mu_{\alpha}, \sigma_{\alpha})\)
\(\beta_p \sim \text{Normal}(\mu_{\beta}, \sigma_{\beta})\)
for i in range(N_Weeks):
\(FVC_i \sim \text{Normal}(\alpha_{p[i]} + \beta_{p[i]} \cdot Week_i, \sigma)\)
### Hierarchical model
def final_model(sample_weeks, sample_patient_code, sample_fvc=None):
= numpyro.sample("μ_α", dist.Normal(0.0, 500.0))
μ_α = numpyro.sample("σ_α", dist.HalfNormal(100.0))
σ_α = numpyro.sample("μ_β", dist.Normal(0.0, 3.0))
μ_β = numpyro.sample("σ_β", dist.HalfNormal(3.0))
σ_β
= len(np.unique(sample_patient_code))
n_patients
with numpyro.plate("Participants", n_patients):
= numpyro.sample("α", dist.Normal(μ_α, σ_α))
α = numpyro.sample("β", dist.Normal(μ_β, σ_β))
β
= numpyro.sample("σ", dist.HalfNormal(100.0))
σ = α[sample_patient_code] + β[sample_patient_code] * sample_weeks
FVC_est
with numpyro.plate("data", len(sample_patient_code)):
"fvc", dist.Normal(FVC_est, σ), obs=sample_fvc) numpyro.sample(
# Render the model graph
=model_kwargs,
numpyro.render_model(final_model, model_kwargs=True,
render_distributions=True,
render_params )
= NUTS(final_model)
nuts_final
= MCMC(nuts_final, num_samples=4000, num_warmup=2000)
mcmc_final = random.PRNGKey(0) rng_key
**model_kwargs) mcmc_final.run(rng_key,
sample: 100%|██████████| 6000/6000 [00:50<00:00, 117.83it/s, 63 steps of size 1.04e-02. acc. prob=0.84]
= Predictive(final_model, mcmc_final.get_samples()) predictive_final
=True); az.plot_trace(az.from_numpyro(mcmc_final), compact
= Predictive(final_model, mcmc_final.get_samples()) predictive_hierarchical
= predictive_hierarchical(rng_key,
predictions_train_hierarchical = model_kwargs["sample_weeks"],
sample_weeks = model_kwargs["sample_patient_code"])['fvc']
sample_patient_code
= predictions_train_hierarchical.mean(axis=0)
mu_predictions_train_h = predictions_train_hierarchical.std(axis=0)
std_predictions_train_h
"Hierarchical"] = mean_absolute_error(sample_fvc, mu_predictions_train_h)
maes[
"Hierarchical"] = coverage(sample_fvc, mu_predictions_train_h, std_predictions_train_h).item()
coverages[
print(maes)
print(coverages)
{'LinearRegression': 654.8103093180237, 'PooledModel': 654.6283571306085, 'UnpooledModel': 98.18583518632232, 'UnpooledModel2': 81.93005607511398, 'Hierarchical': 83.05726212759492}
{'pooled': 0.9399612545967102, 'unpooled': 0.974822461605072, 'unpooled2': 0.886378288269043, 'Hierarchical': 0.9754680395126343}
pd.Series(maes)
LinearRegression 654.810309
PooledModel 654.628357
UnpooledModel 98.185835
UnpooledModel2 81.930056
Hierarchical 83.057262
dtype: float64
# Predict for a given patient
def predict_final(patient_code):
= predictive_final(rng_key, all_weeks, patient_code)
predictions = predictions["fvc"].mean(axis=0)
mu = predictions["fvc"].std(axis=0)
sigma return mu, sigma
# Plot the predictions for a given patient
def plot_patient_final(patient_code):
= predict_final(patient_code)
mu, sigma
plt.plot(all_weeks, mu)- sigma, mu + sigma, alpha=0.1)
plt.fill_between(all_weeks, mu = patient_encoder.inverse_transform([patient_code])[0]
id_to_patient #print(id_to_patient[0], patient_code)
#print(patient_code, id_to_patient)
= train[train["Patient"] == id_to_patient]["Weeks"]
patient_weeks = train[train["Patient"] == id_to_patient]["FVC"]
patient_fvc =0.5)
plt.scatter(patient_weeks, patient_fvc, alpha#plt.scatter(sample_weeks[train["patient_code"] == patient_code.item()], fvc[train["patient_code"] == patient_code.item()], alpha=0.5)
"Weeks")
plt.xlabel("FVC")
plt.ylabel(0]) plt.title(patient_encoder.inverse_transform([patient_code])[
# plot for a given patient
0])) plot_patient_final(np.array([
0) plot_total(
Questions to ponder
- Incorporating extra information
- How to get
sigma
for sklearn model - How to predict for a new partipant in the hierarchical model
- Assuming they have some observations
- Add the data from this participant at the training time
- “Fine-tune” on this participant data
- Assuming they do not have any observations
- Assuming they have some observations