import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
Grid Search
Grid Search
Hyperparameter Tuning
Dataset creation
# Create a DataFrame for classification containing four integer-valued features
# (each drawn from 0-99) and one binary target (0 or 1), 100 rows in total.
# No random seed is set, so the generated data differs on every run.
df = pd.DataFrame({
    'feature1': np.random.randint(0, 100, 100),
    'feature2': np.random.randint(0, 100, 100),
    'feature3': np.random.randint(0, 100, 100),
    'feature4': np.random.randint(0, 100, 100),
    'target': np.random.randint(0, 2, 100),
})
feature1 | feature2 | feature3 | feature4 | target | |
0 | 29 | 14 | 66 | 83 | 1 |
1 | 68 | 70 | 87 | 72 | 1 |
2 | 42 | 5 | 40 | 67 | 1 |
3 | 2 | 54 | 79 | 0 | 1 |
4 | 81 | 36 | 35 | 75 | 0 |
# Positional split of the rows: the first 50 are used for training and the
# next 30 (rows 50-79) for validation. Rows from 80 onward are not touched here.
train_df = df[:50]
validation_df = df[50:80]
# Baseline model: a decision tree with scikit-learn's default hyperparameters,
# fitted on the four feature columns of the training split.
dt = DecisionTreeClassifier()
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Mean accuracy of the fitted tree on the validation split.
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
# Try one hand-picked configuration (entropy criterion, depth limited to 2):
# fit on the training split, then report mean accuracy on the validation split.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2)
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
# Search grid: hyperparameter name -> candidate values.
# The full grid has 2 * 9 * 9 = 162 combinations.
hyperparams = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
}
# Exhaustive grid search written with one nested loop per hyperparameter.
# Each combination is fitted on the training split and scored on the
# validation split; `out` maps (criterion, max_depth, min_samples_split)
# to the validation accuracy of that combination.
out = {}
for c in hyperparams['criterion']:
    for d in hyperparams['max_depth']:
        for s in hyperparams['min_samples_split']:
            dt = DecisionTreeClassifier(criterion=c, max_depth=d, min_samples_split=s)
            dt.fit(
                train_df[['feature1', 'feature2', 'feature3', 'feature4']],
                train_df['target'],
            )
            out[(c, d, s)] = dt.score(
                validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
                validation_df['target'],
            )
# Turn the score dict into a Series (the tuple keys become a MultiIndex) and
# list the combinations from best to worst validation accuracy.
hp_ser = pd.Series(out)
hp_ser.sort_values(ascending=False)
entropy 10 10 0.766667
7 7 0.766667
9 8 0.766667
8 10 0.766667
9 0.766667
gini 10 5 0.500000
8 3 0.500000
7 4 0.500000
3 0.500000
5 2 0.500000
Length: 162, dtype: float64
('entropy', 4, 6)
# Refit the selected configuration on the first 80 rows, i.e. the training
# and validation splits combined.
best_dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
best_dt.fit(df[:80][['feature1', 'feature2', 'feature3', 'feature4']], df[:80]['target'])
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
# Mean accuracy on the rows from 80 onward, which were used neither for
# fitting nor for choosing the hyperparameters.
best_dt.score(df[80:][['feature1', 'feature2', 'feature3', 'feature4']], df[80:]['target'])
Without using multiple nested loops
# Show the (name, candidates) pairs of the grid together with how many there are;
# a dict has exactly as many items as it has keys.
print(hyperparams.items(), len(hyperparams))
dict_items([('criterion', ['gini', 'entropy']), ('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]), ('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]) 3
def print_vec(x, y, z):
    """Print three values inside square brackets, one value per line.

    Helper for demonstrating ``*`` unpacking: it takes exactly three
    positional arguments, so ``print_vec(*iterable)`` shows what each
    element of a three-item iterable is.
    """
    print(f"[{x} \n{y} \n{z}]")
# Unpack the three (name, candidates) pairs of the grid into x, y and z.
print_vec(*hyperparams.items())
[('criterion', ['gini', 'entropy'])
('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10])
('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]
[('criterion', 'max_depth', 'min_samples_split'),
(['gini', 'entropy'],
[2, 3, 4, 5, 6, 7, 8, 9, 10],
[2, 3, 4, 5, 6, 7, 8, 9, 10])]
# Transpose the (name, candidates) pairs: `keys` is the tuple of hyperparameter
# names and `values` the tuple of candidate lists, in matching order.
keys, values = zip(*hyperparams.items())
('criterion', 'max_depth', 'min_samples_split')
(['gini', 'entropy'],
[2, 3, 4, 5, 6, 7, 8, 9, 10],
[2, 3, 4, 5, 6, 7, 8, 9, 10])
import itertools
[('gini', 2, 2),
('gini', 3, 3),
('gini', 4, 4),
('gini', 5, 5),
('gini', 6, 6),
('gini', 7, 7),
('gini', 8, 8),
('gini', 9, 9),
('gini', 10, 10),
('entropy', 3, 2),
('entropy', 4, 3),
('entropy', 5, 4),
('entropy', 6, 5),
('entropy', 7, 6),
('entropy', 8, 7),
('entropy', 9, 8),
('entropy', 10, 9)]
# Take only the first combination of the Cartesian product of the candidate
# lists, to inspect what a single grid point looks like.
v = next(itertools.product(*values))
print(v)
('gini', 2, 2)
# Pair each hyperparameter name with the value chosen for it in `v`.
print_vec(*zip(keys, v))
[('criterion', 'gini')
('max_depth', 2)
('min_samples_split', 2)]
def print_dict(**kwargs):
    """Print the keyword arguments received, as a dict.

    Helper for demonstrating ``**`` unpacking: calling
    ``print_dict(**mapping)`` prints the mapping back.
    """
    print(kwargs)
# Build {name: value} for one grid point and pass it as keyword arguments.
print_dict(**dict(zip(keys, v)))
{'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
# The same exhaustive grid search as the nested-loop version, written as a
# single loop over the Cartesian product of the candidate lists. Each
# combination is turned into keyword arguments for the classifier, fitted on
# the training split and scored on the validation split.
out = {}
for v in itertools.product(*values):
    params = dict(zip(keys, v))
    dt = DecisionTreeClassifier(**params)
    dt.fit(
        train_df[['feature1', 'feature2', 'feature3', 'feature4']],
        train_df['target'],
    )
    out[(params['criterion'], params['max_depth'], params['min_samples_split'])] = dt.score(
        validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
        validation_df['target'],
    )
# List the combinations from best to worst validation accuracy.
pd.Series(out).sort_values(ascending=False)
entropy 10 10 0.766667
7 7 0.766667
9 8 0.766667
8 10 0.766667
9 0.766667
3 3 0.500000
4 0.500000
5 0.500000
gini 10 5 0.500000
3 0.500000
Length: 162, dtype: float64