Residual Connections in PyTorch

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, TensorDataset

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Regression dataset from sklearn diabetes dataset

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)

# Train, validation, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Convert to torch tensors
X_train = torch.from_numpy(X_train).float()
X_val = torch.from_numpy(X_val).float()
X_test = torch.from_numpy(X_test).float()

y_train = torch.from_numpy(y_train).float().view(-1, 1)
y_val = torch.from_numpy(y_val).float().view(-1, 1)
y_test = torch.from_numpy(y_test).float().view(-1, 1)

# Create dataloaders
train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=32, shuffle=True)
val_loader = DataLoader(list(zip(X_val, y_val)), batch_size=32, shuffle=False)
test_loader = DataLoader(list(zip(X_test, y_test)), batch_size=32, shuffle=False)

# Model 1: Linear regression

class LinearRegression(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)
        
    def forward(self, x):
        return self.linear(x)

def train(model, optimizer, criterion = nn.MSELoss(), train_loader=None, val_loader=None, epochs=1000):
    train_losses = []
    val_losses = []
    for e in range(epochs):
        train_loss = 0
        val_loss = 0
        model.train()
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * batch_x.size(0)
        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        model.eval()
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                output = model(batch_x)
                loss = criterion(output, batch_y)
                val_loss += loss.item() * batch_x.size(0)
            val_loss /= len(val_loader.dataset)
            val_losses.append(val_loss)

        if (e+1) % 100 == 0:
            print(f'Epoch {e+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    return train_losses, val_losses

criterion = nn.MSELoss()

model = LinearRegression(X_train.shape[1])
optimizer = optim.Adam(model.parameters(), lr=0.1)

train_losses, val_losses = train(model, optimizer, criterion, train_loader, val_loader, epochs=1000)

plt.plot(train_losses, label='Training loss')
plt.plot(val_losses, label='Validation loss')

Epoch 100/1000, Train Loss: 10315.2895, Validation Loss: 12724.1896
Epoch 200/1000, Train Loss: 4804.3067, Validation Loss: 5319.9323
Epoch 300/1000, Train Loss: 3715.5066, Validation Loss: 3480.5550
Epoch 400/1000, Train Loss: 3388.3811, Validation Loss: 3083.5462
Epoch 500/1000, Train Loss: 3194.2672, Validation Loss: 2970.7406
Epoch 600/1000, Train Loss: 3075.9221, Validation Loss: 2929.4660
Epoch 700/1000, Train Loss: 3007.6152, Validation Loss: 2911.9031
Epoch 800/1000, Train Loss: 2970.0181, Validation Loss: 2906.4726
Epoch 900/1000, Train Loss: 2948.9850, Validation Loss: 2904.5393
Epoch 1000/1000, Train Loss: 2937.7591, Validation Loss: 2901.6748

# RMSE on test set

def rmse(model, test_loader):
    model.eval()
    with torch.no_grad():
        test_loss = 0
        for batch_x, batch_y in test_loader:
            output = model(batch_x)
            loss = criterion(output, batch_y)
            test_loss += loss.item() * batch_x.size(0)
        test_loss /= len(test_loader.dataset)
    return np.sqrt(test_loss)

errors = {}

errors["linreg"] = rmse(model, test_loader)

# Model 2: MLP

class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 10)
        self.fc4 = nn.Linear(10, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

mlp = MLP(X_train.shape[1])
optimizer = optim.Adam(mlp.parameters(), lr=0.001)

train_losses, val_losses = train(mlp, optimizer, criterion, train_loader, val_loader, epochs=1500)

Epoch 100/1500, Train Loss: 3663.8245, Validation Loss: 3086.0981
Epoch 200/1500, Train Loss: 3077.9123, Validation Loss: 2912.6680
Epoch 300/1500, Train Loss: 2952.4684, Validation Loss: 2906.0980
Epoch 400/1500, Train Loss: 2920.8557, Validation Loss: 2916.5231
Epoch 500/1500, Train Loss: 2908.5543, Validation Loss: 2903.3417
Epoch 600/1500, Train Loss: 2898.7857, Validation Loss: 2920.6625
Epoch 700/1500, Train Loss: 2888.3550, Validation Loss: 2886.4415
Epoch 800/1500, Train Loss: 2874.1253, Validation Loss: 2905.2355
Epoch 900/1500, Train Loss: 2866.0193, Validation Loss: 2915.6193
Epoch 1000/1500, Train Loss: 2858.1495, Validation Loss: 2914.3357
Epoch 1100/1500, Train Loss: 2849.0639, Validation Loss: 2919.2359
Epoch 1200/1500, Train Loss: 2835.6841, Validation Loss: 2927.5912
Epoch 1300/1500, Train Loss: 2814.3485, Validation Loss: 2942.1452
Epoch 1400/1500, Train Loss: 2802.6443, Validation Loss: 2928.9871
Epoch 1500/1500, Train Loss: 2789.4206, Validation Loss: 2943.3993

plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')

errors["MLP"]= rmse(mlp, test_loader)
errors

{'linreg': 53.8179193310273, 'MLP': 51.32108141562702}

# Model 3: Residual MLP

# Adding a skip connection from input to output

class ResMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32 + input_dim, 16)
        self.fc3 = nn.Linear(16 + input_dim, 10)
        self.fc4 = nn.Linear(10, 1)
        
    def forward(self, x):
        x_inp = x
        x = F.relu(self.fc1(x)) 
        # Concatenate input to output of first layer
        x = torch.cat((x, x_inp), dim=1)
        x = F.relu(self.fc2(x))

        # Concatenate input to output of second layer
        x = torch.cat((x, x_inp), dim=1)
        x = F.relu(self.fc3(x))

        return self.fc4(x)

res_mlp = ResMLP(X_train.shape[1])
res_mlp(X_train).shape

torch.Size([282, 1])

res_mlp = ResMLP(X_train.shape[1])
optimizer = optim.Adam(res_mlp.parameters(), lr=0.001)

train_losses, val_losses = train(res_mlp, optimizer, criterion, train_loader, val_loader, epochs=1500)

Epoch 100/1500, Train Loss: 3298.6719, Validation Loss: 2937.0046
Epoch 200/1500, Train Loss: 2958.9598, Validation Loss: 2908.0027
Epoch 300/1500, Train Loss: 2933.4946, Validation Loss: 2915.9148
Epoch 400/1500, Train Loss: 2924.1275, Validation Loss: 2885.2503
Epoch 500/1500, Train Loss: 2918.3424, Validation Loss: 2883.3218
Epoch 600/1500, Train Loss: 2913.7633, Validation Loss: 2891.1281
Epoch 700/1500, Train Loss: 2909.4774, Validation Loss: 2875.5787
Epoch 800/1500, Train Loss: 2906.8811, Validation Loss: 2885.1002
Epoch 900/1500, Train Loss: 2896.1299, Validation Loss: 2894.3401
Epoch 1000/1500, Train Loss: 2881.0517, Validation Loss: 2883.1614
Epoch 1100/1500, Train Loss: 2859.3730, Validation Loss: 2900.0346
Epoch 1200/1500, Train Loss: 2843.1591, Validation Loss: 2892.0867
Epoch 1300/1500, Train Loss: 2825.5589, Validation Loss: 2935.1988
Epoch 1400/1500, Train Loss: 2807.9256, Validation Loss: 2898.6540
Epoch 1500/1500, Train Loss: 2798.3861, Validation Loss: 2901.6544

plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')

errors["resnet"] = rmse(res_mlp, test_loader)
errors

{'linreg': 53.8179193310273,
 'MLP': 51.32108141562702,
 'resnet': 51.87944353261233}

# Adding LR scheduler
def train(model, optimizer, criterion=nn.MSELoss(), epochs=1000, lr_scheduler=None):
    train_losses = []
    val_losses = []

    for e in range(epochs):
        # Training loop
        train_loss = 0
        model.train()  # set the model to training mode
        for X_train_batch, y_train_batch in train_loader:
            optimizer.zero_grad()
            y_train_pred = model(X_train_batch)
            loss = criterion(y_train_pred, y_train_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_losses.append(train_loss / len(train_loader))

        # Validation loop
        val_loss = 0
        model.eval()  # set the model to evaluation mode
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                y_val_pred = model(X_val_batch)
                loss = criterion(y_val_pred, y_val_batch)
                val_loss += loss.item()
            val_losses.append(val_loss / len(val_loader))

        # Update learning rate
        if lr_scheduler is not None:
            lr_scheduler.step(val_losses[-1])

        # Print progress
        if e % 100 == 0:
            print(f"Epoch {e}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

    return train_losses, val_losses

# Use a LRPlateau scheduler to reduce the learning rate when the validation loss stops improving

from torch.optim.lr_scheduler import ReduceLROnPlateau

res_mlp = ResMLP(X_train.shape[1])

optimizer = optim.Adam(res_mlp.parameters(), lr=0.01)

lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=50, verbose=True)

train_losses, val_losses = train(res_mlp, optimizer, criterion, epochs=1500, lr_scheduler=lr_scheduler)

Epoch 0, Train Loss: 28639.6855, Val Loss: 33158.0742
Epoch 100, Train Loss: 2911.0527, Val Loss: 2420.7449
Epoch 00141: reducing learning rate of group 0 to 5.0000e-03.
Epoch 00192: reducing learning rate of group 0 to 2.5000e-03.
Epoch 200, Train Loss: 2815.0154, Val Loss: 2422.1171
Epoch 00243: reducing learning rate of group 0 to 1.2500e-03.
Epoch 00294: reducing learning rate of group 0 to 6.2500e-04.
Epoch 300, Train Loss: 2783.4652, Val Loss: 2429.3894
Epoch 00345: reducing learning rate of group 0 to 3.1250e-04.
Epoch 00396: reducing learning rate of group 0 to 1.5625e-04.
Epoch 400, Train Loss: 2794.7555, Val Loss: 2432.0923
Epoch 00447: reducing learning rate of group 0 to 7.8125e-05.
Epoch 00498: reducing learning rate of group 0 to 3.9063e-05.
Epoch 500, Train Loss: 2799.6670, Val Loss: 2432.4159
Epoch 00549: reducing learning rate of group 0 to 1.9531e-05.
Epoch 00600: reducing learning rate of group 0 to 9.7656e-06.
Epoch 600, Train Loss: 2773.7532, Val Loss: 2431.9507
Epoch 00651: reducing learning rate of group 0 to 4.8828e-06.
Epoch 700, Train Loss: 2789.9358, Val Loss: 2431.6210
Epoch 00702: reducing learning rate of group 0 to 2.4414e-06.
Epoch 00753: reducing learning rate of group 0 to 1.2207e-06.
Epoch 800, Train Loss: 2809.1560, Val Loss: 2431.6233
Epoch 00804: reducing learning rate of group 0 to 6.1035e-07.
Epoch 00855: reducing learning rate of group 0 to 3.0518e-07.
Epoch 900, Train Loss: 2825.4572, Val Loss: 2431.6108
Epoch 00906: reducing learning rate of group 0 to 1.5259e-07.
Epoch 00957: reducing learning rate of group 0 to 7.6294e-08.
Epoch 1000, Train Loss: 2780.1241, Val Loss: 2431.6187
Epoch 01008: reducing learning rate of group 0 to 3.8147e-08.
Epoch 01059: reducing learning rate of group 0 to 1.9073e-08.
Epoch 1100, Train Loss: 2789.0557, Val Loss: 2431.6189
Epoch 1200, Train Loss: 2816.7811, Val Loss: 2431.6187
Epoch 1300, Train Loss: 2810.0122, Val Loss: 2431.6189
Epoch 1400, Train Loss: 2782.1095, Val Loss: 2431.6191

plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')

errors["resnet-lr"] = rmse(res_mlp, test_loader)

import pandas as pd
pd.Series(errors).plot.barh()