import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Retina display
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from latexify import latexify, format_axes
# Hyperparameter Tuning
# Tags: ML, Tutorial
#
# ## MakeMoons Dataset
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons
# Generate the dataset: 1000 two-class "moons" samples with noise=0.3.
# The fixed seed keeps the generated points identical across runs.
X, y = make_moons(n_samples=1000, noise=0.3, random_state=42)

# Split the data into training, validation, and test sets.
# First hold out 40% of the samples, then cut that hold-out in half, giving
# a 60% / 20% / 20% train / validation / test partition.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)
# Plot the training points in feature space, coloured by class label.
# NOTE(review): latexify/format_axes come from the local `latexify` helper
# module; presumably they set the figure size and tidy the axes -- confirm.
latexify(fig_width=5, fig_height=4)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, label='Train')
format_axes(plt.gca())
plt.show()
# Define the hyperparameters' possible values.
# Every combination of these three lists is evaluated by the searches below.
max_depth_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
min_samples_split_values = [2, 3, 4, 5, 6, 7, 8]
criteria_values = ['gini', 'entropy']
# ## Nested For Loops
# Track the best validation accuracy seen so far and the settings behind it.
best_accuracy = 0
best_params = {}

# Exhaustive search: one tree is trained per
# (max_depth, min_samples_split, criterion) combination.
for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        for criterion in criteria_values:
            # Define the Decision Tree Classifier
            dt_classifier = DecisionTreeClassifier(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                criterion=criterion,
                random_state=42,
            )
            dt_classifier.fit(X_train, y_train)

            # Evaluate on the validation set
            val_accuracy = dt_classifier.score(X_val, y_val)

            # Check if this combination gives a better accuracy.
            # The strict '>' keeps the first combination found on ties.
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = {
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'criterion': criterion
                }

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy:", best_accuracy)
# Train the model with the best hyperparameters.
# Reuse the seed from the search (random_state=42) so the tree evaluated on
# the test set is the same one that achieved the best validation accuracy.
best_dt_classifier = DecisionTreeClassifier(**best_params, random_state=42)
best_dt_classifier.fit(X_train, y_train)

# Evaluate on the test set (data never used during the search).
test_accuracy = best_dt_classifier.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")
# Recorded output:
#   Best Hyperparameters: {'max_depth': 7, 'min_samples_split': 2, 'criterion': 'entropy'}
#   Best Validation Accuracy: 0.925
#   Test Accuracy: 0.8950
# ## Using Itertools
from itertools import product
# Track the best validation accuracy seen so far and the settings behind it.
best_accuracy = 0
best_params = {}

# Use itertools.product for a more succinct code: it yields the same
# combinations, in the same order, as three nested for-loops would.
for max_depth, min_samples_split, criterion in product(
    max_depth_values, min_samples_split_values, criteria_values
):
    # Define the Decision Tree Classifier
    dt_classifier = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        criterion=criterion,
        random_state=42,
    )
    dt_classifier.fit(X_train, y_train)

    # Evaluate on the validation set
    val_accuracy = dt_classifier.score(X_val, y_val)

    # Check if this combination gives a better accuracy.
    # The strict '>' keeps the first combination found on ties.
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_params = {
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'criterion': criterion
        }

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy:", best_accuracy)
# Train the model with the best hyperparameters.
# Reuse the seed from the search (random_state=42) so the tree evaluated on
# the test set is the same one that achieved the best validation accuracy.
best_dt_classifier = DecisionTreeClassifier(**best_params, random_state=42)
best_dt_classifier.fit(X_train, y_train)

# Evaluate on the test set (data never used during the search).
test_accuracy = best_dt_classifier.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")
# Recorded output:
#   Best Hyperparameters: {'max_depth': 7, 'min_samples_split': 2, 'criterion': 'entropy'}
#   Best Validation Accuracy: 0.925
#   Test Accuracy: 0.8950
# ## Using Sklearn Grid Search (5-fold Cross-Validation)
from sklearn.model_selection import GridSearchCV
# Define the Decision Tree Classifier (base estimator for the grid search).
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'max_depth': max_depth_values,
    'min_samples_split': min_samples_split_values,
    'criterion': criteria_values
}

# Cross-validation builds its own validation folds, so the manual train and
# validation splits are merged into a single development set.
X_train_val = np.concatenate([X_train, X_val], axis=0)
y_train_val = np.concatenate([y_train, y_val], axis=0)
# Use GridSearchCV for hyperparameter tuning: every combination in
# param_grid is scored by accuracy with 5-fold cross-validation.
num_inner_folds = 5
grid_search = GridSearchCV(
    dt_classifier, param_grid, scoring='accuracy', cv=num_inner_folds
)
grid_search.fit(X_train_val, y_train_val)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate on the test set using the estimator GridSearchCV refit with the
# best parameters (refit is left at its default).
test_accuracy = grid_search.best_estimator_.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")
# Recorded output:
#   Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 8}
#   Test Accuracy: 0.9000