import numpy as np
20)
np.random.seed(import matplotlib.pyplot as plt
import pandas as pd
# Retina display
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from latexify import latexify, format_axes
Hyperparams Tuning Strategies Experimentation
ML
Tutorial
MakeMoons Dataset
1.1 Fixed Train-Test (70:30) split ; No Tuning
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons

# Generate the dataset: 1000 noisy "two moons" points (X) and their labels (y)
X, y = make_moons(n_samples=1000, noise=0.3, random_state=42)

# Split the data into training and test sets (70:30). No validation set is
# carved out here because this section does not tune any hyperparameters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
# Scatter-plot both splits, colouring every point by its class label;
# test points are drawn with an 'x' marker to tell them apart.
latexify(fig_width=5, fig_height=4)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, label='Train')
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='x', label='Test')
format_axes(plt.gca())
plt.legend()
# Sizes of the two splits
len(X_train), len(X_test)
(700, 300)
# Hyperparameters take their default values; only the random seed is fixed
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Measure accuracy on the held-out test set
test_accuracy = dt_classifier.score(X_test, y_test)
print("Test set accuracy: {:.4f}".format(test_accuracy))
Test set accuracy: 0.8933
1.2 Multiple Random Train-Test splits
# Initialize empty lists to store, per split, the accuracy, the test labels,
# the predictions and the test features
accuracy_metrics = []
all_test_sets = []
all_predictions = []

X_tests = []

# Perform 10 random train-test splits and repeat the fit
for _ in range(10):
    # Split the data into training and test sets; the split seed itself is
    # drawn from NumPy's global RNG, so every iteration gets a different split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=np.random.randint(100))
    X_tests.append(X_test)

    # Create and fit the decision tree classifier
    dt_classifier = DecisionTreeClassifier(random_state=42)
    dt_classifier.fit(X_train, y_train)

    current_predictions = dt_classifier.predict(X_test)
    all_predictions.append(current_predictions)
    # Fraction of correct predictions, computed by hand
    current_accuracy = np.mean(current_predictions == y_test)
    all_test_sets.append(y_test)

    # Calculate the accuracy on the test set
    test_accuracy = dt_classifier.score(X_test, y_test)

    # Append the accuracy to the list
    accuracy_metrics.append(test_accuracy)

# Calculate the mean and standard deviation of the accuracy metrics
mean_accuracy = np.mean(accuracy_metrics)
std_accuracy = np.std(accuracy_metrics)

# Print the mean and standard deviation
print("Mean accuracy: {:.4f}".format(mean_accuracy))
print("Standard deviation: {:.4f}".format(std_accuracy))
# Print minimum and maximum accuracies
print("Minimum accuracy: {:.4f}".format(min(accuracy_metrics)))
print("Maximum accuracy: {:.4f}".format(max(accuracy_metrics)))
Mean accuracy: 0.8800
Standard deviation: 0.0211
Minimum accuracy: 0.8400
Maximum accuracy: 0.9133
# Count the distinct samples (unique rows) that landed in at least one of
# the random test sets collected in X_tests
found_unique_test_samples = len(np.unique(np.concatenate(X_tests), axis=0))
print(f"Number of unique test samples: {found_unique_test_samples}")
print(f"Ideally we wanted {len(X)} unique test samples")
Number of unique test samples: 964
Ideally we wanted 1000 unique test samples
1.3 K-Fold Cross Validation
import numpy as np

# Define the number of folds (k)
k = 5

# Initialize containers to store predictions (keyed by fold index) and accuracies
predictions = {}
accuracies = []

# Calculate the size of each fold.
# NOTE: integer division means that when len(X) is not a multiple of k, the
# trailing len(X) % k samples are never used as test data.
fold_size = len(X) // k

# Perform k-fold cross-validation over contiguous, non-shuffled slices
for i in range(k):
    # Split the data into training and test sets
    test_start = i * fold_size
    test_end = (i + 1) * fold_size
    test_set = X[test_start:test_end]
    test_labels = y[test_start:test_end]

    # Everything outside the test slice is training data
    training_set = np.concatenate((X[:test_start], X[test_end:]), axis=0)
    print(len(test_set))
    training_labels = np.concatenate((y[:test_start], y[test_end:]), axis=0)

    # Train the model
    dt_classifier = DecisionTreeClassifier(random_state=42)
    dt_classifier.fit(training_set, training_labels)

    # Make predictions on the held-out fold
    fold_predictions = dt_classifier.predict(test_set)

    # Calculate the accuracy of the fold
    fold_accuracy = np.mean(fold_predictions == test_labels)

    # Store the predictions and accuracy of the fold
    predictions[i] = fold_predictions
    accuracies.append(fold_accuracy)

# Print the accuracy of each fold
for i in range(k):
    print("Fold {}: Accuracy: {:.4f}".format(i+1, accuracies[i]))
200
200
200
200
200
Fold 1: Accuracy: 0.8700
Fold 2: Accuracy: 0.8850
Fold 3: Accuracy: 0.9300
Fold 4: Accuracy: 0.8650
Fold 5: Accuracy: 0.8850
# NOTE: the stray `from cgi import test` that used to sit here was dropped:
# nothing in this cell uses it, and the cgi module no longer exists in
# Python 3.13+, so the import would raise there.
from sklearn.model_selection import KFold

# Define the number of folds (k)
k = 5

# Initialize containers to store predictions (keyed by fold index) and accuracies
predictions = {}
accuracies = []

# Create a KFold instance; shuffle=False keeps folds as contiguous slices
kf = KFold(n_splits=k, shuffle=False)

# Perform k-fold cross-validation
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # Split the data into training and test sets
    training_set, test_set = X[train_index], X[test_index]
    print(len(test_set))
    training_labels, test_labels = y[train_index], y[test_index]

    # Train the model
    dt_classifier = DecisionTreeClassifier(random_state=42)
    dt_classifier.fit(training_set, training_labels)

    # Make predictions on the held-out fold
    fold_predictions = dt_classifier.predict(test_set)

    # Calculate the accuracy of the fold
    fold_accuracy = np.mean(fold_predictions == test_labels)

    # Store the predictions and accuracy of the fold
    predictions[i] = fold_predictions
    accuracies.append(fold_accuracy)

    # Print the accuracy of the fold
    print("Fold {}: Accuracy: {:.4f}".format(i+1, fold_accuracy))
200
Fold 1: Accuracy: 0.8700
200
Fold 2: Accuracy: 0.8850
200
Fold 3: Accuracy: 0.9300
200
Fold 4: Accuracy: 0.8650
200
Fold 5: Accuracy: 0.8850
Micro and Macro Averaging
from sklearn.metrics import accuracy_score

# Method 1 (micro average): pool the predictions of all folds and score them
# against y in one go. This relies on `predictions` holding the folds in
# order and on the folds being contiguous slices of y (shuffle=False).
accuracy_1 = accuracy_score(y, np.concatenate(list(predictions.values())))

# Method 2 (macro average): mean of the per-fold accuracies
accuracy_2 = np.mean(accuracies)

# Print the micro and macro averaged accuracy
print("Method 1 accuracy: {:.4f}".format(accuracy_1))
print("Method2 accuracy: {:.4f}".format(accuracy_2))
Method 1 accuracy: 0.8870
Method2 accuracy: 0.8870
2.1 Fixed Train-Test Split (hyperparameters tuned on Validation set)
2.1.1 Validation Set as fixed Subset of Training Set
# Total number of samples available before any train/validation/test split
len(X)
1000
# Step 1: Split the data into training, validation, and test sets.
# First hold out 30% as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# ... then carve the validation set out of the remaining train+val portion.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.285, random_state=42)

print("Number of training examples: {}".format(len(X_train)))
print("Number of validation examples: {}".format(len(X_val)))
print("Number of testing examples: {}".format(len(X_test)))
Number of training examples: 500
Number of validation examples: 200
Number of testing examples: 300
# Hyperparameter grid to search exhaustively
hyperparameters = {}
hyperparameters['max_depth'] = [1,2,3,4,5,6,7,8,9,10]
hyperparameters['min_samples_split'] = [2,3,4,5,6,7,8]
hyperparameters['criteria_values'] = ['gini', 'entropy']

# Placeholders for the best result; they are filled in after the search
best_accuracy = 0
best_hyperparameters = {}

# out maps a running integer id -> one record per hyperparameter combination
out = {}
count = 0
for max_depth in hyperparameters['max_depth']:
    for min_samples_split in hyperparameters['min_samples_split']:
        for criterion in hyperparameters['criteria_values']:
            # Create and fit the decision tree classifier with the current hyperparameters
            dt_classifier = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion, random_state=42)
            dt_classifier.fit(X_train, y_train)

            # Evaluate the performance on the validation set
            val_accuracy = dt_classifier.score(X_val, y_val)
            out[count] = {'max_depth': max_depth, 'min_samples_split': min_samples_split, 'criterion': criterion, 'val_accuracy': val_accuracy}
            count += 1

# One row per combination; the transpose turns the record ids into the index
hparam_df = pd.DataFrame(out).T
hparam_df
max_depth | min_samples_split | criterion | val_accuracy | |
---|---|---|---|---|
0 | 1 | 2 | gini | 0.785 |
1 | 1 | 2 | entropy | 0.785 |
2 | 1 | 3 | gini | 0.785 |
3 | 1 | 3 | entropy | 0.785 |
4 | 1 | 4 | gini | 0.785 |
... | ... | ... | ... | ... |
135 | 10 | 6 | entropy | 0.895 |
136 | 10 | 7 | gini | 0.89 |
137 | 10 | 7 | entropy | 0.895 |
138 | 10 | 8 | gini | 0.885 |
139 | 10 | 8 | entropy | 0.895 |
140 rows × 4 columns
# Ten hyperparameter combinations with the highest validation accuracy
hparam_df.sort_values(by='val_accuracy', ascending=False).head(10)
max_depth | min_samples_split | criterion | val_accuracy | |
---|---|---|---|---|
76 | 6 | 5 | gini | 0.925 |
77 | 6 | 5 | entropy | 0.925 |
78 | 6 | 6 | gini | 0.925 |
79 | 6 | 6 | entropy | 0.925 |
80 | 6 | 7 | gini | 0.925 |
81 | 6 | 7 | entropy | 0.925 |
83 | 6 | 8 | entropy | 0.925 |
70 | 6 | 2 | gini | 0.92 |
82 | 6 | 8 | gini | 0.92 |
90 | 7 | 5 | gini | 0.915 |
# Ensure dtype of val_accuracy is float
hparam_df['val_accuracy'] = hparam_df['val_accuracy'].astype(float)

# idxmax() returns an index *label*, so look the row up with .loc rather than
# the positional .iloc (the two only coincide for a default 0..n-1 index).
best_hyperparameters_row = hparam_df.loc[hparam_df['val_accuracy'].idxmax()]
best_accuracy = best_hyperparameters_row['val_accuracy']
best_hyperparameters = best_hyperparameters_row[['max_depth', 'min_samples_split', 'criterion']].to_dict()
best_hyperparameters
{'max_depth': 6, 'min_samples_split': 5, 'criterion': 'gini'}
# Evaluate the performance of the selected hyperparameter combination on the test set
dt_classifier = DecisionTreeClassifier(max_depth=best_hyperparameters['max_depth'],
                                       min_samples_split=best_hyperparameters['min_samples_split'],
                                       criterion=best_hyperparameters['criterion'],
                                       random_state=42)

# Refit on the combined training + validation data before scoring on test
dt_classifier.fit(X_train_val, y_train_val)
test_accuracy = dt_classifier.score(X_test, y_test)

print("Best Hyperparameters:", best_hyperparameters)
print("Validation Set accuracy: {:.4f}".format(best_accuracy))
print("Test Set accuracy: {:.4f}".format(test_accuracy))
Best Hyperparameters: {'max_depth': 6, 'min_samples_split': 5, 'criterion': 'gini'}
Validation Set accuracy: 0.9250
Test Set accuracy: 0.9067
Avoiding nested loops by using itertools.product
# The triple nested loop over the grid, repeated here as the "before" version.
# It keeps appending records to the existing `out` dict under the running
# `count` id.
for max_depth in hyperparameters['max_depth']:
    for min_samples_split in hyperparameters['min_samples_split']:
        for criterion in hyperparameters['criteria_values']:
            # Create and fit the decision tree classifier with the current hyperparameters
            dt_classifier = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion, random_state=42)
            dt_classifier.fit(X_train, y_train)

            # Evaluate the performance on the validation set
            val_accuracy = dt_classifier.score(X_val, y_val)
            out[count] = {'max_depth': max_depth, 'min_samples_split': min_samples_split, 'criterion': criterion, 'val_accuracy': val_accuracy}
            count += 1
from itertools import product

# product(...) yields every (max_depth, min_samples_split, criterion) tuple in
# the same order as the three nested loops, so a single flat loop suffices.
# This demonstration only fits each model; it does not record any scores.
for max_depth, min_samples_split, criterion in product(hyperparameters['max_depth'], hyperparameters['min_samples_split'], hyperparameters['criteria_values']):
    # Define the Decision Tree Classifier
    dt_classifier = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        criterion=criterion,
        random_state=42
    )
    dt_classifier.fit(X_train, y_train)
2.1.2 Multiple random subsets of Training Set used as Validation Set
# Initialize a dict to store the optimal hyperparameters for each validation
# set (keyed by subset index) and a list for the matching test accuracies
optimal_hyperparameters = {}
test_accuracies = []

# Set the number of random validation subsets
num_subsets = 5

# Make a pandas dataframe with columns as the hyperparameters, subset number, and validation accuracy
hyperparameters_df = pd.DataFrame(columns=['max_depth', 'min_samples_split', 'criterion', 'subset', 'validation accuracy'])

# Iterate over the subsets
for i in range(num_subsets):
    # Split the data into training and validation sets; using the subset
    # index as the seed gives a different validation subset each time
    X_train_subset, X_val_subset, y_train_subset, y_val_subset = train_test_split(X_train_val, y_train_val, test_size=0.285, random_state=i)

    # Initialize variables to store the best hyperparameters and accuracy for the current subset
    best_accuracy = 0
    best_hyperparameters = {}

    # Iterate over the hyperparameter values
    for max_depth in hyperparameters['max_depth']:
        for min_samples_split in hyperparameters['min_samples_split']:
            for criterion in hyperparameters['criteria_values']:
                # Initialize and train the model with the current hyperparameters
                dt_classifier = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion, random_state=42)
                dt_classifier.fit(X_train_subset, y_train_subset)

                # Evaluate the model on the validation set
                val_accuracy = dt_classifier.score(X_val_subset, y_val_subset)
                # Append one row (subset numbers are 1-based in the table)
                hyperparameters_df.loc[len(hyperparameters_df)] = [max_depth, min_samples_split, criterion, i+1, val_accuracy]

                # Update the best accuracy and hyperparameters; the strict
                # '>' keeps the first combination seen in case of ties
                if val_accuracy > best_accuracy:
                    best_accuracy = val_accuracy
                    best_hyperparameters = {
                        'max_depth': max_depth,
                        'min_samples_split': min_samples_split,
                        'criterion': criterion
                    }

    optimal_hyperparameters[i] = best_hyperparameters

    # Evaluate the model with the best hyperparameters on the test set,
    # refitting on the full training + validation data first
    dt_classifier = DecisionTreeClassifier(max_depth=best_hyperparameters['max_depth'], min_samples_split=best_hyperparameters['min_samples_split'], criterion=best_hyperparameters['criterion'], random_state=42)
    dt_classifier.fit(X_train_val, y_train_val)
    test_accuracy = dt_classifier.score(X_test, y_test)
    test_accuracies.append(test_accuracy)

print("Optimal hyperparameters for {} inner folds/validation sets".format(num_subsets))
print()

# Print the optimal hyperparameters for each validation set
for i in range(num_subsets):
    print("Optimal hyperparameters for validation set {}: {}".format(i+1, optimal_hyperparameters[i]))
    print("Test Accuracy for validation set {}: {:.4f}".format(i+1, test_accuracies[i]))
Optimal hyperparameters for 5 inner folds/validation sets
Optimal hyperparameters for validation set 1: {'max_depth': 7, 'min_samples_split': 6, 'criterion': 'entropy'}
Test Accuracy for validation set 1: 0.9000
Optimal hyperparameters for validation set 2: {'max_depth': 5, 'min_samples_split': 7, 'criterion': 'gini'}
Test Accuracy for validation set 2: 0.9033
Optimal hyperparameters for validation set 3: {'max_depth': 6, 'min_samples_split': 2, 'criterion': 'entropy'}
Test Accuracy for validation set 3: 0.9233
Optimal hyperparameters for validation set 4: {'max_depth': 7, 'min_samples_split': 4, 'criterion': 'entropy'}
Test Accuracy for validation set 4: 0.9000
Optimal hyperparameters for validation set 5: {'max_depth': 6, 'min_samples_split': 2, 'criterion': 'entropy'}
Test Accuracy for validation set 5: 0.9233
# Display every (hyperparameter combination, subset) validation result
hyperparameters_df
max_depth | min_samples_split | criterion | subset | validation accuracy | |
---|---|---|---|---|---|
0 | 1 | 2 | gini | 1 | 0.790 |
1 | 1 | 2 | entropy | 1 | 0.790 |
2 | 1 | 3 | gini | 1 | 0.790 |
3 | 1 | 3 | entropy | 1 | 0.790 |
4 | 1 | 4 | gini | 1 | 0.790 |
... | ... | ... | ... | ... | ... |
695 | 10 | 6 | entropy | 5 | 0.900 |
696 | 10 | 7 | gini | 5 | 0.905 |
697 | 10 | 7 | entropy | 5 | 0.900 |
698 | 10 | 8 | gini | 5 | 0.905 |
699 | 10 | 8 | entropy | 5 | 0.900 |
700 rows × 5 columns
# Mean validation accuracy of each hyperparameter combination across all
# subsets. The column is selected before aggregating so that only the
# accuracy is averaged (not the `subset` id as well).
grouped_df = hyperparameters_df.groupby(['max_depth', 'min_samples_split', 'criterion'])['validation accuracy'].mean()
grouped_df
max_depth min_samples_split criterion
1 2 entropy 0.769
gini 0.771
3 entropy 0.769
gini 0.771
4 entropy 0.769
...
10 6 gini 0.889
7 entropy 0.902
gini 0.894
8 entropy 0.904
gini 0.893
Name: validation accuracy, Length: 140, dtype: float64
# Ten combinations with the highest mean validation accuracy
grouped_df.sort_values(ascending=False).head(10)
max_depth min_samples_split criterion
6 7 entropy 0.914
8 entropy 0.914
7 7 entropy 0.912
6 6 entropy 0.912
7 8 entropy 0.912
6 entropy 0.910
6 4 entropy 0.910
5 entropy 0.910
7 4 entropy 0.909
5 entropy 0.909
Name: validation accuracy, dtype: float64
# Index label of the best mean accuracy: a (max_depth, min_samples_split,
# criterion) tuple, because grouped_df is indexed by those three keys
optimal_hyperparams = grouped_df.idxmax()
optimal_hyperparams
(6, 7, 'entropy')
# Refit a tree with the combination that was best on average and score it on
# the untouched test set
df_classifier = DecisionTreeClassifier(max_depth=optimal_hyperparams[0], min_samples_split=optimal_hyperparams[1], criterion=optimal_hyperparams[2], random_state=42)
df_classifier.fit(X_train_val, y_train_val)
test_accuracy = df_classifier.score(X_test, y_test)
print("Test accuracy: {:.4f}".format(test_accuracy))
Test accuracy: 0.9233
2.2 Nested Cross-Validation
# Hyperparameter grid searched inside every inner fold
hyperparameters['max_depth'] = [1,2,3,4,5,6,7,8,9,10]
hyperparameters['min_samples_split'] = [2,3,4,5,6,7,8]
hyperparameters['criteria_values'] = ['gini', 'entropy']

num_outer_folds = 5
num_inner_folds = 5

kf_outer = KFold(n_splits=num_outer_folds, shuffle=False)
kf_inner = KFold(n_splits=num_inner_folds, shuffle=False)

# Initialize lists to store the accuracies for the outer and inner loops
outer_loop_accuracies = []
inner_loop_accuracies = []

# results maps a running integer id -> one record per
# (outer fold, inner fold, hyperparameter combination)
results = {}
outer_count = 0
overall_count = 0
# Iterate over the outer folds
for outer_train_index, outer_test_index in kf_outer.split(X):
    # Split the data into outer training and test sets
    X_outer_train, X_outer_test = X[outer_train_index], X[outer_test_index]
    y_outer_train, y_outer_test = y[outer_train_index], y[outer_test_index]

    inner_count = 0

    # The inner folds split only the outer training data, so the outer test
    # fold is never seen while hyperparameters are being compared
    for innner_train_index, inner_test_index in kf_inner.split(X_outer_train):
        print("*****"*20)
        print("Outer Fold {}, Inner Fold {}".format(outer_count+1, inner_count+1))
        # Split the data into inner training and test sets
        X_inner_train, X_inner_test = X_outer_train[innner_train_index], X_outer_train[inner_test_index]
        y_inner_train, y_inner_test = y_outer_train[innner_train_index], y_outer_train[inner_test_index]

        for max_depth, min_samples_split, criterion in product(hyperparameters['max_depth'],
                                                               hyperparameters['min_samples_split'],
                                                               hyperparameters['criteria_values']):

            #print(max_depth, min_samples_split, criterion)
            # Initialize and train the model with the current hyperparameters
            dt_classifier = DecisionTreeClassifier(max_depth=max_depth,
                                                   min_samples_split=min_samples_split,
                                                   criterion=criterion, random_state=42)
            dt_classifier.fit(X_inner_train, y_inner_train)

            # Evaluate the model on the inner test set
            val_accuracy = dt_classifier.score(X_inner_test, y_inner_test)

            results[overall_count] = {'outer_fold': outer_count,
                                      'inner_fold': inner_count,
                                      'max_depth': max_depth,
                                      'min_samples_split': min_samples_split,
                                      'criterion': criterion,
                                      'val_accuracy': val_accuracy}
            overall_count += 1

        inner_count += 1
    outer_count += 1
****************************************************************************************************
Outer Fold 1, Inner Fold 1
****************************************************************************************************
Outer Fold 1, Inner Fold 2
****************************************************************************************************
Outer Fold 1, Inner Fold 3
****************************************************************************************************
Outer Fold 1, Inner Fold 4
****************************************************************************************************
Outer Fold 1, Inner Fold 5
****************************************************************************************************
Outer Fold 2, Inner Fold 1
****************************************************************************************************
Outer Fold 2, Inner Fold 2
****************************************************************************************************
Outer Fold 2, Inner Fold 3
****************************************************************************************************
Outer Fold 2, Inner Fold 4
****************************************************************************************************
Outer Fold 2, Inner Fold 5
****************************************************************************************************
Outer Fold 3, Inner Fold 1
****************************************************************************************************
Outer Fold 3, Inner Fold 2
****************************************************************************************************
Outer Fold 3, Inner Fold 3
****************************************************************************************************
Outer Fold 3, Inner Fold 4
****************************************************************************************************
Outer Fold 3, Inner Fold 5
****************************************************************************************************
Outer Fold 4, Inner Fold 1
****************************************************************************************************
Outer Fold 4, Inner Fold 2
****************************************************************************************************
Outer Fold 4, Inner Fold 3
****************************************************************************************************
Outer Fold 4, Inner Fold 4
****************************************************************************************************
Outer Fold 4, Inner Fold 5
****************************************************************************************************
Outer Fold 5, Inner Fold 1
****************************************************************************************************
Outer Fold 5, Inner Fold 2
****************************************************************************************************
Outer Fold 5, Inner Fold 3
****************************************************************************************************
Outer Fold 5, Inner Fold 4
****************************************************************************************************
Outer Fold 5, Inner Fold 5
# One row per recorded result; the transpose turns the record ids into the
# index and the record keys into columns
overall_results = pd.DataFrame(results).T
overall_results
outer_fold | inner_fold | max_depth | min_samples_split | criterion | val_accuracy | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 2 | gini | 0.7625 |
1 | 0 | 0 | 1 | 2 | entropy | 0.7625 |
2 | 0 | 0 | 1 | 3 | gini | 0.7625 |
3 | 0 | 0 | 1 | 3 | entropy | 0.7625 |
4 | 0 | 0 | 1 | 4 | gini | 0.7625 |
... | ... | ... | ... | ... | ... | ... |
3495 | 4 | 4 | 10 | 6 | entropy | 0.9 |
3496 | 4 | 4 | 10 | 7 | gini | 0.91875 |
3497 | 4 | 4 | 10 | 7 | entropy | 0.9 |
3498 | 4 | 4 | 10 | 8 | gini | 0.925 |
3499 | 4 | 4 | 10 | 8 | entropy | 0.9 |
3500 rows × 6 columns
Find the best hyperparameters for each outer fold
# Restrict the results to a single outer fold (0-based id, as stored in
# the 'outer_fold' column)
outer_fold = 0
outer_fold_df = overall_results.query('outer_fold == @outer_fold')
outer_fold_df
outer_fold | inner_fold | max_depth | min_samples_split | criterion | val_accuracy | |
---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 2 | gini | 0.7625 |
1 | 0 | 0 | 1 | 2 | entropy | 0.7625 |
2 | 0 | 0 | 1 | 3 | gini | 0.7625 |
3 | 0 | 0 | 1 | 3 | entropy | 0.7625 |
4 | 0 | 0 | 1 | 4 | gini | 0.7625 |
... | ... | ... | ... | ... | ... | ... |
695 | 0 | 4 | 10 | 6 | entropy | 0.85 |
696 | 0 | 4 | 10 | 7 | gini | 0.86875 |
697 | 0 | 4 | 10 | 7 | entropy | 0.85625 |
698 | 0 | 4 | 10 | 8 | gini | 0.86875 |
699 | 0 | 4 | 10 | 8 | entropy | 0.85625 |
700 rows × 6 columns
Aggregate the validation accuracies for each hyperparameter combination across all inner folds
# Mean validation accuracy of each hyperparameter combination across the
# inner folds of this outer fold, best ten first. The column is selected
# before aggregating so that only the accuracy is averaged (not the fold ids).
outer_fold_df.groupby(['max_depth', 'min_samples_split', 'criterion'])['val_accuracy'].mean().sort_values(ascending=False).head(10)
max_depth min_samples_split criterion
6 7 gini 0.9175
8 gini 0.9175
6 gini 0.9175
4 gini 0.91625
3 gini 0.91625
2 gini 0.91625
5 gini 0.91625
7 6 gini 0.91625
7 gini 0.91625
8 gini 0.915
Name: val_accuracy, dtype: object