Week 7 Lab: From Models to Experiments

CS 203: Software Tools and Techniques for AI

In this lab, you’ll: 1. See why single train/test splits are unreliable 2. Use cross-validation properly 3. Plot learning curves and validation curves 4. Compare grid search vs random search 5. Use Optuna for Bayesian hyperparameter optimization 6. Understand nested cross-validation 7. Try AutoML with AutoGluon

Setup

# Install required packages (uncomment if needed)
# !pip install pandas scikit-learn matplotlib seaborn optuna plotly

# Core data / plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # NOTE(review): sns is never referenced below -- confirm it's needed
# Model-selection utilities used throughout the lab.
from sklearn.model_selection import (
    train_test_split, cross_val_score, GridSearchCV,
    RandomizedSearchCV, StratifiedKFold, learning_curve,
    validation_curve
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
# randint/uniform distributions feed RandomizedSearchCV param sampling.
from scipy.stats import randint, uniform
import warnings
# Silences ALL warnings (incl. sklearn ConvergenceWarning) to keep lab output clean --
# note this also hides real problems such as non-converged solvers.
warnings.filterwarnings('ignore')

# Global seed for reproducible examples.
np.random.seed(42)
print("All imports successful!")

Create the Movie Dataset

Synthetic dataset similar to what we’ve built over weeks 1-5.

# Build a reproducible synthetic movie dataset: all draws come from a fixed-seed RNG.
np.random.seed(42)
n_samples = 800

# Feature columns: categorical genre plus numeric production attributes.
genres = np.random.choice(['Action', 'Comedy', 'Drama', 'Horror', 'Sci-Fi'], n_samples)
budgets = np.random.uniform(5, 300, n_samples)       # budget in millions
runtimes = np.random.uniform(80, 180, n_samples)     # runtime in minutes
is_sequel = np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
star_power = np.random.uniform(1, 10, n_samples)
release_month = np.random.randint(1, 13, n_samples)  # ints in 1..12

# Success depends on features with some non-linearity plus Gaussian noise.
# (randint already returns an integer array, so no .astype(int) cast is needed
# before the month comparison.)
success_prob = (
    0.3
    + 0.002 * budgets
    + 0.03 * star_power
    + 0.1 * is_sequel
    + np.where(genres == 'Action', 0.1, 0)
    + np.where(release_month == 6, 0.08, 0)  # summer bump
    + np.random.normal(0, 0.12, n_samples)
)
success_prob = np.clip(success_prob, 0, 1)
# Binary label: Bernoulli draw against each movie's success probability.
success = (np.random.random(n_samples) < success_prob).astype(int)

movies = pd.DataFrame({
    'genre': genres, 'budget': budgets, 'runtime': runtimes,
    'is_sequel': is_sequel, 'star_power': star_power,
    'release_month': release_month, 'success': success
})

# Genre is a string column; map it to integer codes so sklearn models can use it.
le = LabelEncoder()
movies['genre_encoded'] = le.fit_transform(movies['genre'])

# Feature matrix / target vector used by every experiment below.
feature_cols = ['genre_encoded', 'budget', 'runtime', 'is_sequel', 'star_power', 'release_month']
X = movies.loc[:, feature_cols].to_numpy()
y = movies['success'].to_numpy()

n_rows, n_cols = X.shape
print(f"Dataset: {n_rows} samples, {n_cols} features")
print(f"Success rate: {y.mean():.1%}")

Part 1: Why Single Splits Are Unreliable

Run the same model with different random splits and see how much the accuracy varies.

# Accuracy of the same model under 20 different 80/20 splits -- how much does
# the estimate wobble when only the split seed changes?
def _split_accuracy(split_seed):
    # One split + fit + score; only the split randomness varies.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=split_seed
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_tr, y_tr)
    return clf.score(X_te, y_te)

single_split_scores = np.array([_split_accuracy(s) for s in range(20)])

print("Accuracy from 20 different random splits:")
print(f"  Min:  {single_split_scores.min():.1%}")
print(f"  Max:  {single_split_scores.max():.1%}")
print(f"  Mean: {single_split_scores.mean():.1%}")
print(f"  Std:  {single_split_scores.std():.1%}")
print(f"  Range: {(single_split_scores.max() - single_split_scores.min())*100:.1f} percentage points!")
# Two views of the same variance: per-split bars and a histogram.
mean_acc = single_split_scores.mean()
fig, (ax_bar, ax_hist) = plt.subplots(1, 2, figsize=(14, 5))

# Left: one bar per random split.
ax_bar.bar(range(20), single_split_scores, color='steelblue', alpha=0.7)
ax_bar.axhline(y=mean_acc, color='red', linestyle='--', label=f'Mean: {mean_acc:.1%}')
ax_bar.set_xlabel('Random Split #')
ax_bar.set_ylabel('Accuracy')
ax_bar.set_title('Same Model, Different Splits')
ax_bar.legend()

# Right: distribution of the 20 accuracies.
ax_hist.hist(single_split_scores, bins=10, color='steelblue', alpha=0.7, edgecolor='black')
ax_hist.axvline(x=mean_acc, color='red', linestyle='--', label=f'Mean: {mean_acc:.1%}')
ax_hist.set_xlabel('Accuracy')
ax_hist.set_ylabel('Count')
ax_hist.set_title('Distribution of Accuracy Across Splits')
ax_hist.legend()

plt.tight_layout()
plt.show()

print("\nWould you trust any single number from this distribution?")

Part 2: Cross-Validation

Now let’s use 5-fold CV to get a reliable estimate.

# One 5-fold cross-validation run: five scores instead of a single split's one.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(rf, X, y, cv=5)

print("5-Fold Cross-Validation:")
print(f"  Fold scores: {cv_scores}")
print(f"  Mean: {cv_scores.mean():.3f}")
print(f"  Std:  {cv_scores.std():.3f}")
print(f"\nReport as: {cv_scores.mean():.1%} +/- {cv_scores.std():.1%}")
# Compare models with CV: dummy baseline vs three real learners, all under the
# same 5-fold protocol so the numbers are directly comparable.
# NOTE: max_iter raised for LogisticRegression -- on these unscaled features the
# default 100 iterations may not converge, and the resulting ConvergenceWarning
# is silenced by the global warnings filter at the top of the file.
models = {
    'Dummy (majority)': DummyClassifier(strategy='most_frequent'),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

results = []
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5)
    results.append({'Model': name, 'Mean': scores.mean(), 'Std': scores.std()})
    print(f"{name:25s}  {scores.mean():.1%} +/- {scores.std():.1%}")

# Tidy frame for plotting in the next cell.
results_df = pd.DataFrame(results)
# Horizontal bars with +/- 1 std error bars; the dummy baseline is gray.
fig, ax = plt.subplots(figsize=(10, 5))
bar_colors = ['gray'] + ['steelblue'] * 3
ax.barh(results_df['Model'], results_df['Mean'], xerr=results_df['Std'],
        capsize=5, color=bar_colors, alpha=0.8)

# Vertical reference line at the baseline's mean accuracy.
baseline_mean = results_df.iloc[0]['Mean']
ax.axvline(x=baseline_mean, color='red', linestyle='--',
           alpha=0.5, label='Baseline')

ax.set_xlabel('Accuracy (5-fold CV)')
ax.set_title('Model Comparison with Cross-Validation')
ax.legend()
plt.tight_layout()
plt.show()

Data Leakage Demo

What happens if you preprocess BEFORE cross-validation?

# WRONG: the scaler is fit on ALL rows before CV, so every fold's "test" data
# has already influenced the preprocessing statistics (leakage).
leaky_features = StandardScaler().fit_transform(X)
scores_wrong = cross_val_score(LogisticRegression(), leaky_features, y, cv=5)

# RIGHT: inside a Pipeline, the scaler is re-fit on each training fold only,
# so the held-out fold stays truly unseen.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])
scores_right = cross_val_score(pipe, X, y, cv=5)

print("Data Leakage Demo:")
print(f"  With leakage (wrong):   {scores_wrong.mean():.4f} +/- {scores_wrong.std():.4f}")
print(f"  Without leakage (right): {scores_right.mean():.4f} +/- {scores_right.std():.4f}")
print(f"\nDifference: {(scores_wrong.mean() - scores_right.mean())*100:.2f} percentage points")
print("The leaky version is slightly optimistic -- it 'cheats' by peeking at test data.")

Part 3: Learning Curves and Validation Curves

# Learning curve: train on growing fractions of the data, score each size
# with 5-fold CV -- does adding data still improve validation accuracy?
train_sizes, train_scores, val_scores = learning_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    n_jobs=-1
)

fig, ax = plt.subplots(figsize=(10, 6))
# Plot both curves the same way: mean line plus a +/- 1 std band.
for scores, label, color in (
    (train_scores, 'Training Score', 'steelblue'),
    (val_scores, 'Validation Score', 'orange'),
):
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    ax.plot(train_sizes, mean, 'o-', label=label, color=color)
    ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.15, color=color)

ax.set_xlabel('Training Set Size')
ax.set_ylabel('Accuracy')
ax.set_title('Learning Curve: Random Forest')
ax.legend()
ax.set_ylim(0.6, 1.05)
plt.tight_layout()
plt.show()

# Train/validation gap at the largest training size = overfitting indicator.
gap = train_scores.mean(axis=1)[-1] - val_scores.mean(axis=1)[-1]
print(f"Gap at full data: {gap:.1%}")
print("A large gap means overfitting; more data might help.")
print("If both curves are flat and low, the model is underfitting.")
# Validation curve: sweep max_depth and compare train vs validation accuracy.
# NOTE: max_depth=None (unlimited) cannot sit on a numeric axis, so 50 serves
# as an effectively-unlimited stand-in. (A second, never-used list containing
# None was removed -- it was dead code.)
depth_range = [1, 2, 3, 5, 7, 10, 15, 20, 30, 50]

train_scores_vc, val_scores_vc = validation_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X, y,
    param_name='max_depth',
    param_range=depth_range,
    cv=5,
    n_jobs=-1
)

fig, ax = plt.subplots(figsize=(10, 6))
# Mean line plus +/- 1 std band for each of the two curves.
for scores, label, color in (
    (train_scores_vc, 'Training Score', 'steelblue'),
    (val_scores_vc, 'Validation Score', 'orange'),
):
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    ax.plot(depth_range, mean, 'o-', label=label, color=color)
    ax.fill_between(depth_range, mean - std, mean + std, alpha=0.15, color=color)

# Mark the depth with the best mean validation score.
best_idx = val_scores_vc.mean(axis=1).argmax()
ax.axvline(x=depth_range[best_idx], color='green', linestyle='--',
           label=f'Best: max_depth={depth_range[best_idx]}')

ax.set_xlabel('max_depth')
ax.set_ylabel('Accuracy')
ax.set_title('Validation Curve: Finding the Best max_depth')
ax.legend()
ax.set_ylim(0.6, 1.05)
plt.tight_layout()
plt.show()

print(f"Best max_depth: {depth_range[best_idx]}")
print(f"Validation accuracy at best: {val_scores_vc.mean(axis=1)[best_idx]:.1%}")
print("\nLeft of peak = underfitting, right of peak = overfitting")

Part 5: Optuna – Bayesian Optimization

(Note: Part 4 — grid search vs. random search — appears to be missing from this file; the comparison cells further below reference its `grid` and `random_search` results.)

Optuna learns from previous trials to make smarter choices.

# Feature flag for the Optuna sections: import succeeds -> True, else False.
OPTUNA_AVAILABLE = False
try:
    import optuna
except ImportError:
    print("Install optuna: pip install optuna")
else:
    # Quiet Optuna's per-trial logging so the notebook output stays readable.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    OPTUNA_AVAILABLE = True
    print("Optuna is available!")
if OPTUNA_AVAILABLE:
    def objective(trial):
        """One Optuna trial: sample RF hyperparameters, return mean 5-fold CV accuracy."""
        sampled = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 30),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_float('max_features', 0.1, 1.0),
        }
        clf = RandomForestClassifier(**sampled, random_state=42)
        return cross_val_score(clf, X, y, cv=5).mean()

    # TPE sampler by default: later trials are guided by earlier results.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50, show_progress_bar=True)

    print(f"\nBest score: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")
else:
    print("Skipping Optuna section.")
if OPTUNA_AVAILABLE:
    # Optimization history: how scores improve over trials
    optuna.visualization.plot_optimization_history(study).show()
else:
    print("Skipping.")
if OPTUNA_AVAILABLE:
    # Which hyperparameters matter most?
    optuna.visualization.plot_param_importances(study).show()
else:
    print("Skipping.")
# Compare all tuning approaches side by side.
print("\n" + "="*50)
print("ALL TUNING APPROACHES COMPARED")
print("="*50)
print(f"No tuning (default RF):  {cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=5).mean():.4f}")
# NOTE(review): `grid` and `random_search` are supposed to come from Part 4
# (grid search vs random search), which is missing from this file. Guard the
# lookups so this cell doesn't crash with NameError when Part 4 wasn't run.
if 'grid' in globals():
    print(f"Grid Search (36 trials): {grid.best_score_:.4f}")
if 'random_search' in globals():
    print(f"Random Search (36 trials): {random_search.best_score_:.4f}")
if OPTUNA_AVAILABLE:
    print(f"Optuna (50 trials):      {study.best_value:.4f}")

Part 6: Nested Cross-Validation

The best_score_ from GridSearchCV is optimistically biased. Nested CV gives an honest estimate.

# Inner loop: GridSearchCV tunes hyperparameters with its own 3-fold CV.
tuned_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={'max_depth': [5, 10, 15], 'n_estimators': [100, 200]},
    cv=3,  # 3-fold inner CV for tuning
    n_jobs=-1
)

# Outer loop: treat the whole tuning procedure as "the model" and evaluate it
# with 5-fold CV, so each outer test fold was never seen during tuning.
outer_scores = cross_val_score(tuned_rf, X, y, cv=5)

# One plain fit on all data, just to expose best_score_ for the comparison.
tuned_rf.fit(X, y)

print("Nested CV vs GridSearchCV.best_score_:")
print(f"  GridSearchCV.best_score_ (optimistic): {tuned_rf.best_score_:.4f}")
print(f"  Nested CV (honest):                    {outer_scores.mean():.4f} +/- {outer_scores.std():.4f}")
print(f"  Optimism gap:                          {(tuned_rf.best_score_ - outer_scores.mean())*100:.2f} percentage points")
print(f"\nAlways use nested CV when reporting tuned model performance!")

Part 7: AutoML with AutoGluon (Optional)

AutoML automates everything: model selection, tuning, and ensembling.

# Uncomment to install: !pip install autogluon
try:
    from autogluon.tabular import TabularPredictor
    AUTOGLUON_AVAILABLE = True
    print("AutoGluon is available!")
except ImportError:
    AUTOGLUON_AVAILABLE = False
    print("AutoGluon not installed. Install with: pip install autogluon")
if AUTOGLUON_AVAILABLE:
    # AutoGluon takes a single DataFrame that contains the label column too
    # (note: raw 'genre' strings, not the encoded column -- AutoGluon handles
    # categoricals itself).
    label_col = 'success'
    train_ag = movies[['genre', 'budget', 'runtime', 'is_sequel',
                       'star_power', 'release_month', label_col]]

    predictor = TabularPredictor(label=label_col, eval_metric='accuracy')
    predictor.fit(train_ag, time_limit=120)  # 2-minute training budget

    print("\nLeaderboard:")
    print(predictor.leaderboard())
else:
    print("Skipping AutoGluon. Install with: pip install autogluon")

Summary

# Final summary: recompute the headline numbers and print them together.
print("="*60)
print("FINAL SUMMARY")
print("="*60)

baseline = cross_val_score(DummyClassifier(strategy='most_frequent'), X, y, cv=5).mean()
# max_iter raised: on these unscaled features the default 100 iterations may
# not converge, and the warning is silenced by the global filter at the top.
lr_score = cross_val_score(LogisticRegression(random_state=42, max_iter=1000), X, y, cv=5).mean()
rf_score = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=5).mean()

print(f"\nDummy baseline:     {baseline:.1%}")
print(f"Logistic Regression: {lr_score:.1%}")
print(f"Random Forest:       {rf_score:.1%}")
# NOTE(review): `grid` is defined in the missing Part 4 (grid vs random
# search); guard so this summary still runs when that part was skipped.
if 'grid' in globals():
    print(f"Tuned RF (grid):     {grid.best_score_:.1%}  (optimistic!)")
print(f"Tuned RF (nested):   {outer_scores.mean():.1%}  (honest)")
if OPTUNA_AVAILABLE:
    print(f"Optuna best:         {study.best_value:.1%}")

print(f"\n" + "="*60)
print("Key lessons:")
print("1. Single splits are unreliable -- always use CV")
print("2. Validation curves show the best hyperparameter value")
print("3. Random search beats grid search for the same budget")
print("4. Optuna is smarter -- it learns from previous trials")
print("5. best_score_ is optimistic -- use nested CV for honest evaluation")
print("6. AutoML automates everything but gives up interpretability")

Exercises

  1. Stratified CV: Use StratifiedKFold explicitly and compare with default cross_val_score
  2. More models: Add XGBoost or LightGBM to the comparison
  3. Optuna for LR: Tune C and penalty for Logistic Regression using Optuna
  4. Learning curve for DT: Plot a learning curve for Decision Tree – does it overfit?
  5. AutoGluon presets: Try medium_quality vs good_quality – how much does extra time help?