Vanilla Linear Regression


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
# Retina display
%config InlineBackend.figure_format = 'retina'
from jax import random

import warnings
warnings.filterwarnings('ignore')

plt.rcParams['figure.constrained_layout.use'] = True
import seaborn as sns
sns.set_context("notebook")
URL = "https://gist.githubusercontent.com/ucals/" + "2cf9d101992cb1b78c2cdd6e3bac6a4b/raw/"+ "43034c39052dcf97d4b894d2ec1bc3f90f3623d9/"+ "osic_pulmonary_fibrosis.csv"
train = pd.read_csv(URL)
train.head()
Patient Weeks FVC Percent Age Sex SmokingStatus
0 ID00007637202177411956430 -4 2315 58.253649 79 Male Ex-smoker
1 ID00007637202177411956430 5 2214 55.712129 79 Male Ex-smoker
2 ID00007637202177411956430 7 2061 51.862104 79 Male Ex-smoker
3 ID00007637202177411956430 9 2144 53.950679 79 Male Ex-smoker
4 ID00007637202177411956430 11 2069 52.063412 79 Male Ex-smoker
train.describe()
Weeks FVC Percent Age
count 1549.000000 1549.000000 1549.000000 1549.000000
mean 31.861846 2690.479019 77.672654 67.188509
std 23.247550 832.770959 19.823261 7.057395
min -5.000000 827.000000 28.877577 49.000000
25% 12.000000 2109.000000 62.832700 63.000000
50% 28.000000 2641.000000 75.676937 68.000000
75% 47.000000 3171.000000 88.621065 72.000000
max 133.000000 6399.000000 153.145378 88.000000
# Number of unique patients
train['Patient'].nunique()
176
# Number of records per patient
(train['Patient'].value_counts()   # records per patient
 .value_counts()                   # number of patients with that many records
 .sort_index()
 .plot(kind='bar'))
plt.xlabel("Number of records per patient")
plt.ylabel("Number of patients")

def chart_patient(patient_id, ax):
    data = train[train["Patient"] == patient_id]
    x = data["Weeks"]
    y = data["FVC"]
    ax.set_title(patient_id, fontsize=8)
    sns.regplot(x=x, y=y, ax=ax, ci=None, line_kws={"color": "black", "lw":2})


f, axes = plt.subplots(1, 3, sharey=True)
chart_patient("ID00007637202177411956430", axes[0])
chart_patient("ID00009637202177434476278", axes[1])
chart_patient("ID00010637202177584971671", axes[2])
f.tight_layout()

try:
    import numpyro
except ImportError:
    %pip install numpyro
    import numpyro
import numpyro.distributions as dist
sample_weeks = train["Weeks"].values
sample_fvc = train["FVC"].values
### Linear regression from scikit-learn
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(sample_weeks.reshape(-1, 1), sample_fvc)
all_weeks = np.arange(-12, 134, 1)
# Plot the data and the regression line
plt.scatter(sample_weeks, sample_fvc, alpha=0.3)
plt.plot(all_weeks, lr.predict(all_weeks.reshape(-1, 1)), color="black", lw=2)
plt.xlabel("Weeks")
plt.ylabel("FVC")

lr.coef_, lr.intercept_
(array([-1.48471319]), 2737.784722381955)
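As a quick cross-check (our addition, not part of the original analysis), a degree-1 polynomial fit with NumPy should recover the same line:

# Least-squares line via np.polyfit; should match lr.coef_[0] and lr.intercept_
slope, intercept = np.polyfit(sample_weeks, sample_fvc, 1)
slope, intercept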
# Finding the mean absolute error

from sklearn.metrics import mean_absolute_error

maes = {}
maes["LinearRegression"] = mean_absolute_error(sample_fvc, lr.predict(sample_weeks.reshape(-1, 1)))
maes
{'LinearRegression': 654.8103093180237}

Pooled model

\(\alpha \sim \text{Normal}(0, 500)\)

\(\beta \sim \text{Normal}(0, 500)\)

\(\sigma \sim \text{HalfNormal}(50)\)

for i in range(N_samples):

\(FVC_i \sim \text{Normal}(\alpha + \beta \cdot Week_i, \sigma)\)

def pooled_model(sample_weeks, sample_fvc=None):
    # A single (pooled) intercept, slope and noise level shared by all patients
    α = numpyro.sample("α", dist.Normal(0., 500.))
    β = numpyro.sample("β", dist.Normal(0., 500.))
    σ = numpyro.sample("σ", dist.HalfNormal(50.))
    with numpyro.plate("samples", len(sample_weeks)):
        fvc = numpyro.sample("fvc", dist.Normal(α + β * sample_weeks, σ), obs=sample_fvc)
    return fvc
sample_weeks.shape
(1549,)
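Before fitting, it can be worth drawing from the prior predictive distribution to see what FVC values the priors allow. A minimal sketch (our addition): when Predictive is given num_samples instead of posterior samples, it draws from the prior:

from numpyro.infer import Predictive

prior_predictive = Predictive(pooled_model, num_samples=100)
prior_fvc = prior_predictive(random.PRNGKey(1), sample_weeks=sample_weeks)["fvc"]
prior_fvc.shape  # (100, 1549): 100 prior draws over the observed weeks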
# Render the model graph
numpyro.render_model(pooled_model, model_kwargs={"sample_weeks": sample_weeks, "sample_fvc": sample_fvc},
                render_distributions=True,
                render_params=True,
                )

from sklearn.preprocessing import LabelEncoder

patient_encoder = LabelEncoder()
train["patient_code"] = patient_encoder.fit_transform(train["Patient"].values)
sample_patient_code = train["patient_code"].values
sample_patient_code
array([  0,   0,   0, ..., 175, 175, 175])
from numpyro.infer import MCMC, NUTS, Predictive

nuts_kernel = NUTS(pooled_model)

mcmc = MCMC(nuts_kernel, num_samples=4000, num_warmup=2000)
rng_key = random.PRNGKey(0)
mcmc.run(rng_key, sample_weeks=sample_weeks, sample_fvc=sample_fvc)
posterior_samples = mcmc.get_samples()
sample: 100%|██████████| 6000/6000 [00:05<00:00, 1137.39it/s, 7 steps of size 5.24e-01. acc. prob=0.88]
import arviz as az

idata = az.from_numpyro(mcmc)
az.plot_trace(idata, compact=True);

# Summary statistics
az.summary(idata, round_to=2)
Shape validation failed: input_shape: (1, 4000), minimum_shape: (chains=2, draws=4)
(We ran a single chain, and ArviZ needs at least two chains to compute r_hat, hence the NaN values in the r_hat column below.)
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
α 2724.98 33.46 2663.52 2786.90 0.74 0.52 2052.67 1952.52 NaN
β -1.22 0.85 -2.80 0.43 0.02 0.01 1955.95 2019.17 NaN
σ 775.05 12.03 752.85 797.95 0.25 0.17 2411.55 2720.36 NaN
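The NaN r_hat values can be fixed by running more than one chain. A minimal sketch (the choice of two chains is ours, not the notebook's; without the device-count hint, NumPyro runs the chains sequentially):

numpyro.set_host_device_count(2)  # must run before JAX initialises to parallelise chains
mcmc_2chains = MCMC(NUTS(pooled_model), num_warmup=2000, num_samples=4000, num_chains=2)
mcmc_2chains.run(random.PRNGKey(0), sample_weeks=sample_weeks, sample_fvc=sample_fvc)
az.summary(az.from_numpyro(mcmc_2chains), round_to=2)  # r_hat column is now populated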
# Predictive distribution
predictive = Predictive(pooled_model, mcmc.get_samples())
predictions = predictive(rng_key, all_weeks, None)
pd.DataFrame(predictions["fvc"]).mean().plot()

plt.plot(all_weeks, predictions["fvc"].mean(axis=0))
plt.scatter(sample_weeks, sample_fvc, alpha=0.1)

# Get the mean and standard deviation of the predictions
mu = predictions["fvc"].mean(axis=0)
sigma = predictions["fvc"].std(axis=0)

# Plot the predictions
plt.plot(all_weeks, mu)
plt.fill_between(all_weeks, mu - 1.96*sigma, mu + 1.96*sigma, alpha=0.2)
plt.scatter(sample_weeks, sample_fvc, alpha=0.2)
plt.xlabel("Weeks")
plt.ylabel("FVC")

preds_pooled  = predictive(rng_key, sample_weeks, None)['fvc']
predictions_train_pooled = preds_pooled.mean(axis=0)
std_train_pooled = preds_pooled.std(axis=0)
### Computing the mean absolute error

maes["PooledModel"] = mean_absolute_error(sample_fvc, predictions_train_pooled)
maes
{'LinearRegression': 654.8103093180237, 'PooledModel': 654.6283571306085}
### Computing the coverage at 95% confidence interval

def coverage(y_true, y_pred, sigma):
    # Fraction of observations inside the central 95% interval,
    # using a normal approximation (±1.96σ) to the predictive distribution
    lower = y_pred - 1.96 * sigma
    upper = y_pred + 1.96 * sigma
    return np.mean((y_true >= lower) & (y_true <= upper))
coverages = {}
coverages["pooled"] = coverage(sample_fvc, predictions_train_pooled, std_train_pooled).item()
coverages
{'pooled': 0.9399612545967102}

The MAE of the pooled Bayesian model is nearly identical to that of plain linear regression, as expected: both fit a single line to all patients. Let's see if we can do better with a personalised model.

Unpooled Model

\(\sigma \sim \text{HalfNormal}(20)\)

for p in range(N_patients):

\(\alpha_p \sim \text{Normal}(0, 500)\)

\(\beta_p \sim \text{Normal}(0, 500)\)

for i in range(N_samples):

\(FVC_i \sim \text{Normal}(\alpha_{p[i]} + \beta_{p[i]} \cdot Week_i, \sigma)\)

# Unpooled model: shared noise, but a separate intercept and slope per patient
def unpool_model(sample_weeks, sample_patient_code, sample_fvc=None):
    σ = numpyro.sample("σ", dist.HalfNormal(20.))
    with numpyro.plate("patients", sample_patient_code.max() + 1):
        α_p = numpyro.sample("α_p", dist.Normal(0., 500.))
        β_p = numpyro.sample("β_p", dist.Normal(0., 500.))
    with numpyro.plate("samples", len(sample_weeks)):
        fvc = numpyro.sample("fvc",
                             dist.Normal(α_p[sample_patient_code] +
                                         β_p[sample_patient_code] * sample_weeks, σ),
                             obs=sample_fvc)
    return fvc
# Render the model graph

model_kwargs = {"sample_weeks": sample_weeks,
                "sample_patient_code": sample_patient_code,
                "sample_fvc": sample_fvc}

numpyro.render_model(unpool_model, model_kwargs=model_kwargs,
                render_distributions=True,
                render_params=True,
                )

nuts_kernel_unpooled = NUTS(unpool_model)

mcmc_unpooled = MCMC(nuts_kernel_unpooled, num_samples=200, num_warmup=500)
rng_key = random.PRNGKey(0)
model_kwargs
{'sample_weeks': array([-4,  5,  7, ..., 31, 43, 59]),
 'sample_patient_code': array([  0,   0,   0, ..., 175, 175, 175]),
 'sample_fvc': array([2315, 2214, 2061, ..., 2908, 2975, 2774])}
mcmc_unpooled.run(rng_key, **model_kwargs)

posterior_samples = mcmc_unpooled.get_samples()
sample: 100%|██████████| 700/700 [00:07<00:00, 90.53it/s, 63 steps of size 9.00e-02. acc. prob=0.91] 
az.plot_trace(az.from_numpyro(mcmc_unpooled), compact=True);

# Predictive distribution for unpooled model

predictive_unpooled = Predictive(unpool_model, mcmc_unpooled.get_samples())
# Predictive distribution for unpooled model for all weeks for a given patient

all_weeks = np.arange(-12, 134, 1)
def predict_unpooled(patient_code):
    predictions = predictive_unpooled(rng_key, all_weeks, patient_code)
    mu = predictions["fvc"].mean(axis=0)
    sigma = predictions["fvc"].std(axis=0)
    return mu, sigma

# Plot the predictions for a given patient
def plot_patient(patient_code):
    mu, sigma = predict_unpooled(patient_code)
    plt.plot(all_weeks, mu)
    plt.fill_between(all_weeks, mu - 1.96*sigma, mu + 1.96*sigma, alpha=0.6)
    id_to_patient = patient_encoder.inverse_transform(patient_code)[0]
    patient_weeks = train[train["Patient"] == id_to_patient]["Weeks"]
    patient_fvc = train[train["Patient"] == id_to_patient]["FVC"]
    plt.scatter(patient_weeks, patient_fvc, alpha=0.5)
    plt.xlabel("Weeks")
    plt.ylabel("FVC")
    plt.title(id_to_patient)
def plot_total(patient_id=0, plot_pooled=False):
    plot_patient(np.array([patient_id]))
    if plot_pooled:
        # `mu` and `sigma` are the pooled-model predictions computed earlier
        plt.plot(all_weeks, mu, color='g')
        plt.fill_between(all_weeks, mu - 1.96*sigma, mu + 1.96*sigma, alpha=0.05, color='g')

plot_total(0, True)

plot_total(1, True)

plot_total(3)

Computing metrics on the training set for the unpooled model


predictions_train_unpooled = predictive_unpooled(rng_key,
                                                 sample_weeks,
                                                 train["patient_code"].values)['fvc']
predictions_train_unpooled.shape
(200, 1549)
mu_predictions_train_unpooled = predictions_train_unpooled.mean(axis=0)
std_predictions_train_unpooled = predictions_train_unpooled.std(axis=0)

maes["UnpooledModel"] = mean_absolute_error(sample_fvc, mu_predictions_train_unpooled)
maes
{'LinearRegression': 654.8103093180237,
 'PooledModel': 654.6283571306085,
 'UnpooledModel': 98.18583518632232}
coverages["unpooled"] = coverage(sample_fvc, mu_predictions_train_unpooled, std_predictions_train_unpooled).item()
coverages
{'pooled': 0.9399612545967102, 'unpooled': 0.974822461605072}

Exercise: Another variation of the unpooled model

# Unpooled model, variation: each patient also gets their own noise level σ_p
def unpool_model_var(sample_weeks, sample_patient_code, sample_fvc=None):
    with numpyro.plate("patients", sample_patient_code.max() + 1):
        α_p = numpyro.sample("α_p", dist.Normal(0., 500.))
        β_p = numpyro.sample("β_p", dist.Normal(0., 500.))
        σ_p = numpyro.sample("σ_p", dist.HalfNormal(20.))
    with numpyro.plate("samples", len(sample_weeks)):
        fvc = numpyro.sample("fvc",
                             dist.Normal(α_p[sample_patient_code] +
                                         β_p[sample_patient_code] * sample_weeks,
                                         σ_p[sample_patient_code]),
                             obs=sample_fvc)
    return fvc

# Plot graphical model
numpyro.render_model(unpool_model_var, model_kwargs=model_kwargs,
                render_distributions=True,
                render_params=True,
                )

nuts_kernel_unpooled_2 = NUTS(unpool_model_var)

mcmc_unpooled_2 = MCMC(nuts_kernel_unpooled_2, num_samples=400, num_warmup=500)
rng_key = random.PRNGKey(0)

mcmc_unpooled_2.run(rng_key, **model_kwargs)

posterior_samples_2 = mcmc_unpooled_2.get_samples()
sample: 100%|██████████| 900/900 [00:08<00:00, 102.48it/s, 63 steps of size 8.17e-02. acc. prob=0.88]
az.plot_trace(az.from_numpyro(mcmc_unpooled_2), compact=True);

# Predictive distribution for unpooled model variation 2

predictive_unpooled_2 = Predictive(unpool_model_var, mcmc_unpooled_2.get_samples())
predictions_train_unpooled_2 = predictive_unpooled_2(rng_key,
                                                        sample_weeks,
                                                        train["patient_code"].values)['fvc']

mu_predictions_train_unpooled_2 = predictions_train_unpooled_2.mean(axis=0)
std_predictions_train_unpooled_2 = predictions_train_unpooled_2.std(axis=0)

maes["UnpooledModel2"] = mean_absolute_error(sample_fvc, mu_predictions_train_unpooled_2)

coverages["unpooled2"] = coverage(sample_fvc, mu_predictions_train_unpooled_2, std_predictions_train_unpooled_2).item()

print(maes)
print(coverages)
{'LinearRegression': 654.8103093180237, 'PooledModel': 654.6283571306085, 'UnpooledModel': 98.18583518632232, 'UnpooledModel2': 81.93005607511398}
{'pooled': 0.9399612545967102, 'unpooled': 0.974822461605072, 'unpooled2': 0.886378288269043}

Note the drop in coverage for this variant: with only a handful of observations per patient, a per-patient σ_p can shrink towards very small values, producing overconfident intervals.

Hierarchical model

\(\mu_{\alpha} \sim \text{Normal}(0, 500)\)

\(\sigma_{\alpha} \sim \text{HalfNormal}(100)\)

\(\mu_{\beta} \sim \text{Normal}(0, 3)\)

\(\sigma_{\beta} \sim \text{HalfNormal}(3)\)

for p in range(N_patients):

\(\alpha_p \sim \text{Normal}(\mu_{\alpha}, \sigma_{\alpha})\)

\(\beta_p \sim \text{Normal}(\mu_{\beta}, \sigma_{\beta})\)

\(\sigma \sim \text{HalfNormal}(100)\)

for i in range(N_samples):

\(FVC_i \sim \text{Normal}(\alpha_{p[i]} + \beta_{p[i]} \cdot Week_i, \sigma)\)

### Hierarchical model

def final_model(sample_weeks, sample_patient_code, sample_fvc=None):
    # Population-level (hyper)priors
    μ_α = numpyro.sample("μ_α", dist.Normal(0.0, 500.0))
    σ_α = numpyro.sample("σ_α", dist.HalfNormal(100.0))
    μ_β = numpyro.sample("μ_β", dist.Normal(0.0, 3.0))
    σ_β = numpyro.sample("σ_β", dist.HalfNormal(3.0))

    n_patients = len(np.unique(sample_patient_code))

    # Per-patient intercepts and slopes, partially pooled towards the population mean
    with numpyro.plate("Participants", n_patients):
        α = numpyro.sample("α", dist.Normal(μ_α, σ_α))
        β = numpyro.sample("β", dist.Normal(μ_β, σ_β))

    σ = numpyro.sample("σ", dist.HalfNormal(100.0))
    FVC_est = α[sample_patient_code] + β[sample_patient_code] * sample_weeks

    with numpyro.plate("data", len(sample_patient_code)):
        numpyro.sample("fvc", dist.Normal(FVC_est, σ), obs=sample_fvc)
# Render the model graph


numpyro.render_model(final_model, model_kwargs=model_kwargs,
                render_distributions=True,
                render_params=True,
                )

nuts_final = NUTS(final_model)

mcmc_final = MCMC(nuts_final, num_samples=4000, num_warmup=2000)
rng_key = random.PRNGKey(0)
mcmc_final.run(rng_key, **model_kwargs)
sample: 100%|██████████| 6000/6000 [00:50<00:00, 117.83it/s, 63 steps of size 1.04e-02. acc. prob=0.84]
predictive_final = Predictive(final_model, mcmc_final.get_samples())
az.plot_trace(az.from_numpyro(mcmc_final), compact=True);
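The sampler above needed 63 leapfrog steps per iteration at a small step size, a hint of the funnel-like geometry centered hierarchical models often exhibit. A non-centered parameterization usually samples more efficiently; a sketch (our addition, not the notebook's model) using NumPyro's LocScaleReparam:

from numpyro.infer.reparam import LocScaleReparam
from numpyro.handlers import reparam

# Rewrite α and β as standardized offsets scaled by σ_α and σ_β (non-centered)
noncentered_model = reparam(final_model,
                            config={"α": LocScaleReparam(0), "β": LocScaleReparam(0)})
mcmc_nc = MCMC(NUTS(noncentered_model), num_warmup=2000, num_samples=4000)
mcmc_nc.run(random.PRNGKey(0), **model_kwargs)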

predictive_hierarchical = predictive_final  # same posterior samples as above
predictions_train_hierarchical = predictive_hierarchical(rng_key,
                                                         sample_weeks = model_kwargs["sample_weeks"],
                                                         sample_patient_code = model_kwargs["sample_patient_code"])['fvc']

mu_predictions_train_h = predictions_train_hierarchical.mean(axis=0)
std_predictions_train_h = predictions_train_hierarchical.std(axis=0)

maes["Hierarchical"] = mean_absolute_error(sample_fvc, mu_predictions_train_h)

coverages["Hierarchical"] = coverage(sample_fvc, mu_predictions_train_h, std_predictions_train_h).item()

print(maes)
print(coverages)
{'LinearRegression': 654.8103093180237, 'PooledModel': 654.6283571306085, 'UnpooledModel': 98.18583518632232, 'UnpooledModel2': 81.93005607511398, 'Hierarchical': 83.05726212759492}
{'pooled': 0.9399612545967102, 'unpooled': 0.974822461605072, 'unpooled2': 0.886378288269043, 'Hierarchical': 0.9754680395126343}
pd.Series(maes)
LinearRegression    654.810309
PooledModel         654.628357
UnpooledModel        98.185835
UnpooledModel2       81.930056
Hierarchical         83.057262
dtype: float64
# Predict for a given patient


def predict_final(patient_code):
    predictions = predictive_final(rng_key, all_weeks, patient_code)
    mu = predictions["fvc"].mean(axis=0)
    sigma = predictions["fvc"].std(axis=0)
    return mu, sigma

# Plot the predictions for a given patient
def plot_patient_final(patient_code):
    mu, sigma = predict_final(patient_code)
    plt.plot(all_weeks, mu)
    # 95% interval, consistent with the earlier plots
    plt.fill_between(all_weeks, mu - 1.96*sigma, mu + 1.96*sigma, alpha=0.1)
    id_to_patient = patient_encoder.inverse_transform(patient_code)[0]
    patient_weeks = train[train["Patient"] == id_to_patient]["Weeks"]
    patient_fvc = train[train["Patient"] == id_to_patient]["FVC"]
    plt.scatter(patient_weeks, patient_fvc, alpha=0.5)
    plt.xlabel("Weeks")
    plt.ylabel("FVC")
    plt.title(id_to_patient)
# plot for a given patient
plot_patient_final(np.array([0]))

# Compare with the unpooled model's fit for the same patient
plot_total(0)

Questions to ponder

  • Incorporating extra information (e.g. Age, Sex, SmokingStatus)
  • How to get an uncertainty estimate (a sigma) out of the sklearn model
  • How to predict for a new participant in the hierarchical model
    • Assuming they have some observations
      • Add the data from this participant at training time
      • “Fine-tune” on this participant's data
    • Assuming they do not have any observations — see the sketch below
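For the last case, one possible approach (a sketch, under the assumption that with no observations everything we know about a new participant is captured by the population-level posterior) is to draw their intercept and slope from the population distribution:

post = mcmc_final.get_samples()
rng = np.random.default_rng(0)

# One new-patient intercept/slope per posterior draw, from the population level
α_new = rng.normal(np.asarray(post["μ_α"]), np.asarray(post["σ_α"]))
β_new = rng.normal(np.asarray(post["μ_β"]), np.asarray(post["σ_β"]))

# Posterior predictive trajectory over all weeks, including observation noise σ
fvc_new = rng.normal(α_new[:, None] + β_new[:, None] * all_weeks,
                     np.asarray(post["σ"])[:, None])

plt.plot(all_weeks, fvc_new.mean(axis=0))
plt.fill_between(all_weeks,
                 np.percentile(fvc_new, 2.5, axis=0),
                 np.percentile(fvc_new, 97.5, axis=0), alpha=0.2)
plt.xlabel("Weeks")
plt.ylabel("FVC")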