Lecture 4: Model Selection & Ensembles

Interactive Demo Notebook

In this notebook, we'll explore:

1. **Overfitting vs Underfitting** — the fundamental tradeoff
2. **Bias-Variance Tradeoff** — why models fail
3. **Cross-Validation** — reliable evaluation
4. **Hyperparameter Tuning** — finding the best settings
5. **Ensemble Methods** — combining models

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_iris, make_classification
import warnings
warnings.filterwarnings('ignore')

# Plotting defaults used by every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})
print("Ready to select the best models! 🎯")

Part 1: Overfitting vs Underfitting

The classic ML mistake

# Build a small 1-D regression dataset: a sine wave corrupted by Gaussian noise.
np.random.seed(42)
n_samples = 30
X = np.linspace(0, 10, n_samples)
y_true = np.sin(X)                        # underlying signal
noise = np.random.randn(n_samples) * 0.3  # observation noise
y = y_true + noise

# sklearn estimators expect a 2-D feature matrix, so add a column axis.
X = X.reshape(-1, 1)

# Hold out 30% of the points for evaluating generalization.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit polynomial models of three complexities and visualize how each one
# generalizes from the training points to the held-out test points.
degrees = [1, 4, 15]  # Underfit, Just right, Overfit
titles = ['Underfitting (Too Simple)', 'Good Fit', 'Overfitting (Too Complex)']
colors = ['#e85a4f', '#2a9d8f', '#e9c46a']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Dense grid over the x-range for drawing smooth prediction curves.
X_plot = np.linspace(0, 10, 100).reshape(-1, 1)

for ax, degree, title, color in zip(axes, degrees, titles, colors):
    # Expand inputs into polynomial feature columns of the given degree.
    expander = PolynomialFeatures(degree)
    train_features = expander.fit_transform(X_train)
    test_features = expander.transform(X_test)
    plot_features = expander.transform(X_plot)

    # Ordinary least squares on the expanded features.
    regressor = LinearRegression()
    regressor.fit(train_features, y_train)

    # Mean squared error on each split.
    mse_train = np.mean((regressor.predict(train_features) - y_train) ** 2)
    mse_test = np.mean((regressor.predict(test_features) - y_test) ** 2)

    # Scatter the data, then overlay the fitted curve.
    ax.scatter(X_train, y_train, c='#1e3a5f', s=60, label='Train', edgecolors='white')
    ax.scatter(X_test, y_test, c='#e85a4f', s=60, marker='s', label='Test', edgecolors='white')
    ax.plot(X_plot, regressor.predict(plot_features), color=color, linewidth=2, label=f'Degree {degree}')

    ax.set_title(f'{title}\nTrain Error: {mse_train:.3f}, Test Error: {mse_test:.3f}', fontsize=11)
    ax.set_xlabel('X')
    ax.set_ylabel('y')
    ax.legend(fontsize=9)
    ax.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

print("💡 Key insight:")
print("   Underfit: High train error, High test error")
print("   Overfit:  Low train error, HIGH test error!")
print("   Good fit: Low train error, Low test error")

Part 2: Bias-Variance Tradeoff

# Sweep polynomial degree from 1 to 15 and record train/test MSE at each step.
def _errors_for_degree(degree):
    """Return (train MSE, test MSE) for a Ridge fit on degree-d polynomial features."""
    expander = PolynomialFeatures(degree)
    train_features = expander.fit_transform(X_train)
    test_features = expander.transform(X_test)

    # Tiny alpha keeps the high-degree fits numerically stable.
    ridge = Ridge(alpha=0.01)
    ridge.fit(train_features, y_train)

    return (np.mean((ridge.predict(train_features) - y_train) ** 2),
            np.mean((ridge.predict(test_features) - y_test) ** 2))

degrees = range(1, 16)
error_pairs = [_errors_for_degree(d) for d in degrees]
train_errors = [pair[0] for pair in error_pairs]
test_errors = [pair[1] for pair in error_pairs]

plt.figure(figsize=(10, 5))
plt.plot(degrees, train_errors, 'b-o', label='Training Error', linewidth=2, markersize=8)
plt.plot(degrees, test_errors, 'r-s', label='Test Error', linewidth=2, markersize=8)

# Mark the degree with the lowest test error.
best_degree = degrees[np.argmin(test_errors)]
plt.axvline(x=best_degree, color='green', linestyle='--', linewidth=2, label=f'Best (degree={best_degree})')

# Shade the two failure regimes.
plt.fill_between([1, 4], 0, 1.5, alpha=0.1, color='blue', label='Underfitting zone')
plt.fill_between([8, 15], 0, 1.5, alpha=0.1, color='red', label='Overfitting zone')

plt.xlabel('Model Complexity (Polynomial Degree)', fontsize=12)
plt.ylabel('Mean Squared Error', fontsize=12)
plt.title('Bias-Variance Tradeoff', fontsize=14)
plt.legend(loc='upper right')
plt.ylim(0, 1.5)
plt.grid(True, alpha=0.3)
plt.show()

print("💡 The test error has a U-shape:")
print("   Too simple → High bias (underfitting)")
print("   Too complex → High variance (overfitting)")

Part 3: Cross-Validation

More reliable than a single train/test split

# Visualize which samples land in train vs validation across the 5 folds.
from sklearn.model_selection import KFold

n_samples = 20
n_folds = 5
kf = KFold(n_splits=n_folds)

fig, ax = plt.subplots(figsize=(12, 4))

# One row per fold: dark squares are training samples, red ones validation.
for fold, (train_idx, val_idx) in enumerate(kf.split(range(n_samples))):
    for sample_indices, shade in ((train_idx, '#1e3a5f'), (val_idx, '#e85a4f')):
        for idx in sample_indices:
            ax.add_patch(plt.Rectangle((idx, fold), 0.9, 0.8,
                                       facecolor=shade, edgecolor='white'))

ax.set_xlim(-0.5, n_samples + 0.5)
ax.set_ylim(-0.5, n_folds + 0.5)
ax.set_xlabel('Sample Index', fontsize=12)
ax.set_ylabel('Fold', fontsize=12)
ax.set_yticks(range(n_folds))
ax.set_yticklabels([f'Fold {i+1}' for i in range(n_folds)])
ax.set_title('5-Fold Cross Validation', fontsize=14)

# Manual legend, since the rectangles were added without labels.
from matplotlib.patches import Patch
ax.legend(handles=[Patch(facecolor='#1e3a5f', label='Training'),
                   Patch(facecolor='#e85a4f', label='Validation')],
          loc='upper right')

plt.tight_layout()
plt.show()

print("💡 Every sample gets to be in the validation set exactly once!")
# Use cross-validation in practice
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X, y = iris.data, iris.target

# Score a decision tree at each candidate depth with 5-fold CV.
depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]
depth_labels, means, stds = [], [], []

for depth in depths:
    candidate = DecisionTreeClassifier(max_depth=depth, random_state=42)
    fold_scores = cross_val_score(candidate, X, y, cv=5)
    depth_labels.append(str(depth))  # str(None) == 'None' for the last entry
    means.append(fold_scores.mean())
    stds.append(fold_scores.std())

# Bar chart of mean accuracy with std-dev error bars.
plt.figure(figsize=(10, 5))
plt.bar(depth_labels, means, yerr=stds, capsize=5, color='#2a9d8f', edgecolor='white', linewidth=2)
plt.xlabel('Max Depth', fontsize=12)
plt.ylabel('Cross-Validation Accuracy', fontsize=12)
plt.title('Finding the Best Tree Depth with CV', fontsize=14)
plt.ylim(0.8, 1.0)

# Recolor the winning bar.
best_idx = np.argmax(means)
plt.gca().patches[best_idx].set_facecolor('#1e3a5f')

plt.show()

print(f"\n🎯 Best max_depth: {depth_labels[best_idx]} with accuracy {means[best_idx]:.1%} (±{stds[best_idx]:.1%})")

Part 4: Grid Search - Systematic Hyperparameter Tuning

from sklearn.svm import SVC

# Candidate hyperparameter values to search exhaustively.
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1]
}

# Grid Search: every (C, gamma) pair is scored with 5-fold CV.
svm = SVC(kernel='rbf')
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

# Derive the heatmap shape from the grid instead of hard-coding 3x3,
# so editing param_grid above doesn't silently break this plot.
# cv_results_ enumerates parameter combinations with keys in sorted order
# ('C' outer, 'gamma' inner), so rows map to C and columns to gamma.
n_c = len(param_grid['C'])
n_gamma = len(param_grid['gamma'])
results = grid_search.cv_results_
scores = np.array(results['mean_test_score']).reshape(n_c, n_gamma)

plt.figure(figsize=(8, 6))
im = plt.imshow(scores, cmap='YlGn', aspect='auto')
plt.colorbar(im, label='Accuracy')

# Axis labels taken directly from the grid values.
plt.xticks(range(n_gamma), param_grid['gamma'])
plt.yticks(range(n_c), param_grid['C'])
plt.xlabel('Gamma', fontsize=12)
plt.ylabel('C', fontsize=12)
plt.title('Grid Search Results', fontsize=14)

# Annotate each cell with its accuracy.
for i in range(n_c):
    for j in range(n_gamma):
        plt.text(j, i, f'{scores[i, j]:.2%}',
                 ha='center', va='center', color='black', fontsize=11)

plt.show()

print(f"\n🎯 Best parameters: {grid_search.best_params_}")
print(f"   Best accuracy: {grid_search.best_score_:.1%}")

Part 5: Ensemble Methods - The Power of Many

# Compare a lone decision tree against a bagged ensemble of 100 trees.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline: one unconstrained tree.
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train, y_train)
single_acc = single_tree.score(X_test, y_test)

# Ensemble: 100 trees whose votes are combined.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train, y_train)
forest_acc = forest.score(X_test, y_test)

# Side-by-side accuracy bars.
fig, ax = plt.subplots(figsize=(8, 5))
model_names = ['Single Tree', 'Random Forest\n(100 trees)']
accuracies = [single_acc, forest_acc]
bar_colors = ['#e85a4f', '#2a9d8f']

bars = ax.bar(model_names, accuracies, color=bar_colors, edgecolor='white', linewidth=2)
ax.set_ylabel('Test Accuracy', fontsize=12)
ax.set_title('Single Tree vs Random Forest', fontsize=14)
ax.set_ylim(0.8, 1.0)

# Print each accuracy just above its bar.
for bar, acc in zip(bars, accuracies):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
            f'{acc:.1%}', ha='center', fontsize=14, fontweight='bold')

plt.show()

print("💡 Random Forest combines many trees, each trained on different subsets!")
print("   The wisdom of crowds often beats a single expert.")
# Feature importance from Random Forest
# Rank features by the forest's impurity-based importance scores.
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, most important first
n_features = len(importances)            # generalized: was hard-coded to 4 (iris-specific)

plt.figure(figsize=(10, 5))
plt.bar(range(n_features), importances[indices], color='#1e3a5f', edgecolor='white', linewidth=2)
plt.xticks(range(n_features), [iris.feature_names[i] for i in indices], rotation=15)
plt.xlabel('Feature', fontsize=12)
plt.ylabel('Importance', fontsize=12)
plt.title('Feature Importance (Random Forest)', fontsize=14)
plt.show()

print(f"\n💡 Most important feature: {iris.feature_names[indices[0]]}")

🎯 Exercises

# Exercise 1: Try different numbers of trees (10, 50, 100, 500)
# How does accuracy change? Is there a point of diminishing returns?
# Exercise 2: Use GridSearchCV to find the best parameters for Random Forest
# Try: n_estimators=[10, 50, 100], max_depth=[3, 5, 10, None]

Summary

| Concept | What We Learned |
|---|---|
| Overfitting | Model too complex, memorizes noise |
| Underfitting | Model too simple, misses pattern |
| Cross-Validation | More reliable than single split |
| Grid Search | Systematic hyperparameter search |
| Random Forest | Many trees > single tree |