import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_iris, make_classification
import warnings
# Silence library deprecation chatter so the demo output stays clean.
warnings.filterwarnings('ignore')
# Consistent plotting defaults for every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
print("Ready to select the best models! 🎯")

Lecture 4: Model Selection & Ensembles
Interactive Demo Notebook

In this notebook, we'll explore:

1. **Overfitting vs Underfitting** — the fundamental tradeoff
2. **Bias-Variance Tradeoff** — why models fail
3. **Cross-Validation** — reliable evaluation
4. **Hyperparameter Tuning** — finding the best settings
5. **Ensemble Methods** — combining models
Part 1: Overfitting vs Underfitting
The classic ML mistake
# Part 1 demo: fit polynomials of degree 1, 4, and 15 to noisy sin(x) data
# and compare train/test error to show underfitting vs overfitting.
# (Indentation of the plotting loop was lost in the notebook export; restored here.)

# Generate data with a clear pattern + noise
np.random.seed(42)
n_samples = 30
X = np.linspace(0, 10, n_samples)
y_true = np.sin(X)                              # true underlying pattern
y = y_true + np.random.randn(n_samples) * 0.3   # add Gaussian noise
X = X.reshape(-1, 1)                            # sklearn expects 2-D feature matrices

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit models with different complexity
degrees = [1, 4, 15]  # underfit, just right, overfit
titles = ['Underfitting (Too Simple)', 'Good Fit', 'Overfitting (Too Complex)']
colors = ['#e85a4f', '#2a9d8f', '#e9c46a']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
X_plot = np.linspace(0, 10, 100).reshape(-1, 1)  # dense grid for smooth fitted curves

for ax, degree, title, color in zip(axes, degrees, titles, colors):
    # Create polynomial features; fit on train only to avoid leakage
    poly = PolynomialFeatures(degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    X_plot_poly = poly.transform(X_plot)

    # Fit model
    model = LinearRegression()
    model.fit(X_train_poly, y_train)

    # Calculate mean squared error on both splits
    train_pred = model.predict(X_train_poly)
    test_pred = model.predict(X_test_poly)
    train_error = np.mean((train_pred - y_train)**2)
    test_error = np.mean((test_pred - y_test)**2)

    # Plot data points and the fitted curve
    ax.scatter(X_train, y_train, c='#1e3a5f', s=60, label='Train', edgecolors='white')
    ax.scatter(X_test, y_test, c='#e85a4f', s=60, marker='s', label='Test', edgecolors='white')
    ax.plot(X_plot, model.predict(X_plot_poly), color=color, linewidth=2, label=f'Degree {degree}')
    ax.set_title(f'{title}\nTrain Error: {train_error:.3f}, Test Error: {test_error:.3f}', fontsize=11)
    ax.set_xlabel('X')
    ax.set_ylabel('y')
    ax.legend(fontsize=9)
    ax.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

print("💡 Key insight:")
print(" Underfit: High train error, High test error")
print(" Overfit: Low train error, HIGH test error!")
print(" Good fit: Low train error, Low test error")

Part 2: Bias-Variance Tradeoff
# Part 2: sweep polynomial degree 1..15 and plot train vs test MSE to show
# the U-shaped test-error curve (bias-variance tradeoff).
# (Loop-body indentation was lost in the notebook export; restored here.)

# Show how train and test error change with model complexity
degrees = range(1, 16)
train_errors = []
test_errors = []

for degree in degrees:
    poly = PolynomialFeatures(degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    # Tiny ridge penalty keeps the high-degree fits numerically stable
    model = Ridge(alpha=0.01)
    model.fit(X_train_poly, y_train)
    train_errors.append(np.mean((model.predict(X_train_poly) - y_train)**2))
    test_errors.append(np.mean((model.predict(X_test_poly) - y_test)**2))

plt.figure(figsize=(10, 5))
plt.plot(degrees, train_errors, 'b-o', label='Training Error', linewidth=2, markersize=8)
plt.plot(degrees, test_errors, 'r-s', label='Test Error', linewidth=2, markersize=8)

# Mark the sweet spot (degree with the lowest test error)
best_degree = degrees[np.argmin(test_errors)]
plt.axvline(x=best_degree, color='green', linestyle='--', linewidth=2, label=f'Best (degree={best_degree})')

# Shade the under/overfitting regions for visual emphasis
plt.fill_between([1, 4], 0, 1.5, alpha=0.1, color='blue', label='Underfitting zone')
plt.fill_between([8, 15], 0, 1.5, alpha=0.1, color='red', label='Overfitting zone')

plt.xlabel('Model Complexity (Polynomial Degree)', fontsize=12)
plt.ylabel('Mean Squared Error', fontsize=12)
plt.title('Bias-Variance Tradeoff', fontsize=14)
plt.legend(loc='upper right')
plt.ylim(0, 1.5)
plt.grid(True, alpha=0.3)
plt.show()

print("💡 The test error has a U-shape:")
print(" Too simple → High bias (underfitting)")
print(" Too complex → High variance (overfitting)")

Part 3: Cross-Validation
More reliable than a single train/test split
# Visualize K-Fold Cross Validation: one row per fold, one rectangle per
# sample, colored by whether the sample lands in train or validation.
# (Loop-body indentation was lost in the notebook export; restored here.)
from sklearn.model_selection import KFold

n_samples = 20
n_folds = 5
kf = KFold(n_splits=n_folds)

fig, ax = plt.subplots(figsize=(12, 4))
for fold, (train_idx, val_idx) in enumerate(kf.split(range(n_samples))):
    # Training samples (dark blue)
    for idx in train_idx:
        rect = plt.Rectangle((idx, fold), 0.9, 0.8, facecolor='#1e3a5f', edgecolor='white')
        ax.add_patch(rect)
    # Validation samples (red)
    for idx in val_idx:
        rect = plt.Rectangle((idx, fold), 0.9, 0.8, facecolor='#e85a4f', edgecolor='white')
        ax.add_patch(rect)

ax.set_xlim(-0.5, n_samples + 0.5)
ax.set_ylim(-0.5, n_folds + 0.5)
ax.set_xlabel('Sample Index', fontsize=12)
ax.set_ylabel('Fold', fontsize=12)
ax.set_yticks(range(n_folds))
ax.set_yticklabels([f'Fold {i+1}' for i in range(n_folds)])
ax.set_title('5-Fold Cross Validation', fontsize=14)

# Legend: proxy patches, since Rectangles added via add_patch carry no labels
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor='#1e3a5f', label='Training'),
                   Patch(facecolor='#e85a4f', label='Validation')]
ax.legend(handles=legend_elements, loc='upper right')
plt.tight_layout()
plt.show()
print("💡 Every sample gets to be in the validation set exactly once!")

# Use cross-validation in practice
# Pick a decision-tree max_depth on iris using 5-fold cross-validation,
# then plot mean accuracy ± std per depth and highlight the best bar.
# (Loop-body indentation was lost in the notebook export; restored here.)
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X, y = iris.data, iris.target

# Compare different max_depth values (None = grow until leaves are pure)
depths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]
cv_scores = []
for depth in depths:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    scores = cross_val_score(tree, X, y, cv=5)  # 5-fold CV accuracy
    # str(None) == 'None', so no special-casing is needed for the label
    cv_scores.append({'depth': str(depth),
                      'mean': scores.mean(),
                      'std': scores.std()})

# Visualize: bar per depth, error bar = std across folds
plt.figure(figsize=(10, 5))
depth_labels = [s['depth'] for s in cv_scores]
means = [s['mean'] for s in cv_scores]
stds = [s['std'] for s in cv_scores]
plt.bar(depth_labels, means, yerr=stds, capsize=5, color='#2a9d8f', edgecolor='white', linewidth=2)
plt.xlabel('Max Depth', fontsize=12)
plt.ylabel('Cross-Validation Accuracy', fontsize=12)
plt.title('Finding the Best Tree Depth with CV', fontsize=14)
plt.ylim(0.8, 1.0)

# Recolor the best-scoring bar
best_idx = np.argmax(means)
plt.gca().patches[best_idx].set_facecolor('#1e3a5f')
plt.show()
print(f"\n🎯 Best max_depth: {depth_labels[best_idx]} with accuracy {means[best_idx]:.1%} (±{stds[best_idx]:.1%})")

Part 4: Grid Search - Systematic Hyperparameter Tuning
# Part 4: exhaustive grid search over SVM hyperparameters (C, gamma),
# each combination scored with 5-fold CV, results shown as a heatmap.
# (Literal/loop indentation was lost in the notebook export; restored here.
#  Grid dimensions are derived from param_grid instead of hard-coded 3x3.)
from sklearn.svm import SVC

# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1]
}

# Grid Search: fits len(C) * len(gamma) models, 5 folds each
svm = SVC(kernel='rbf')
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

# Visualize results as heatmap. GridSearchCV enumerates the (sorted) grid
# with gamma varying fastest, so reshaping gives rows = C, cols = gamma.
results = grid_search.cv_results_
n_c = len(param_grid['C'])
n_gamma = len(param_grid['gamma'])
scores = np.array(results['mean_test_score']).reshape(n_c, n_gamma)

plt.figure(figsize=(8, 6))
im = plt.imshow(scores, cmap='YlGn', aspect='auto')
plt.colorbar(im, label='Accuracy')

# Axis labels come straight from the grid values
plt.xticks(range(n_gamma), param_grid['gamma'])
plt.yticks(range(n_c), param_grid['C'])
plt.xlabel('Gamma', fontsize=12)
plt.ylabel('C', fontsize=12)
plt.title('Grid Search Results', fontsize=14)

# Annotate each cell with its mean CV accuracy
for i in range(n_c):
    for j in range(n_gamma):
        plt.text(j, i, f'{scores[i, j]:.2%}',
                 ha='center', va='center', color='black', fontsize=11)
plt.show()

print(f"\n🎯 Best parameters: {grid_search.best_params_}")
print(f" Best accuracy: {grid_search.best_score_:.1%}")

Part 5: Ensemble Methods - The Power of Many
# Part 5: compare a single decision tree against a 100-tree random forest
# on the same train/test split of iris, shown as a two-bar chart.
# (Loop-body indentation was lost in the notebook export; restored here.)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Single tree
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train, y_train)
single_acc = single_tree.score(X_test, y_test)

# Random Forest: an ensemble of 100 trees, each trained on a bootstrap sample
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train, y_train)
forest_acc = forest.score(X_test, y_test)

# Visualize
fig, ax = plt.subplots(figsize=(8, 5))
models = ['Single Tree', 'Random Forest\n(100 trees)']
accuracies = [single_acc, forest_acc]
colors = ['#e85a4f', '#2a9d8f']
bars = ax.bar(models, accuracies, color=colors, edgecolor='white', linewidth=2)
ax.set_ylabel('Test Accuracy', fontsize=12)
ax.set_title('Single Tree vs Random Forest', fontsize=14)
ax.set_ylim(0.8, 1.0)

# Print each accuracy just above its bar
for bar, acc in zip(bars, accuracies):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f'{acc:.1%}', ha='center', fontsize=14, fontweight='bold')
plt.show()

print("💡 Random Forest combines many trees, each trained on different subsets!")
print(" The wisdom of crowds often beats a single expert.")

# Feature importance from Random Forest
# Plot the trained forest's feature importances, most important first.
# The feature count is taken from the model instead of hard-coding 4,
# so the cell still works if the dataset changes.
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, descending importance
n_features = len(importances)

plt.figure(figsize=(10, 5))
plt.bar(range(n_features), importances[indices], color='#1e3a5f', edgecolor='white', linewidth=2)
plt.xticks(range(n_features), [iris.feature_names[i] for i in indices], rotation=15)
plt.xlabel('Feature', fontsize=12)
plt.ylabel('Importance', fontsize=12)
plt.title('Feature Importance (Random Forest)', fontsize=14)
plt.show()
print(f"\n💡 Most important feature: {iris.feature_names[indices[0]]}")

🎯 Exercises
# Exercise 1: Try different numbers of trees (10, 50, 100, 500)
# How does accuracy change? Is there a point of diminishing returns?
# Exercise 2: Use GridSearchCV to find the best parameters for Random Forest
# Try: n_estimators=[10, 50, 100], max_depth=[3, 5, 10, None]
Summary
| Concept | What We Learned |
|---|---|
| Overfitting | Model too complex, memorizes noise |
| Underfitting | Model too simple, misses pattern |
| Cross-Validation | More reliable than single split |
| Grid Search | Systematic hyperparameter search |
| Random Forest | Many trees > single tree |