import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
Grid Search — Hyperparameter Tuning

Dataset creation
# Create a DataFrame for classification: 100 rows, four integer features
# drawn uniformly from [0, 100), and one binary target in {0, 1}.
# NOTE(review): no random seed is set, so the numbers (and every accuracy
# below) differ from run to run — add np.random.seed(...) for reproducibility.
df = pd.DataFrame({
    'feature1': np.random.randint(0, 100, 100),
    'feature2': np.random.randint(0, 100, 100),
    'feature3': np.random.randint(0, 100, 100),
    'feature4': np.random.randint(0, 100, 100),
    'target': np.random.randint(0, 2, 100),
})
df.head()
feature1 | feature2 | feature3 | feature4 | target | |
---|---|---|---|---|---|
0 | 29 | 14 | 66 | 83 | 1 |
1 | 68 | 70 | 87 | 72 | 1 |
2 | 42 | 5 | 40 | 67 | 1 |
3 | 2 | 54 | 79 | 0 | 1 |
4 | 81 | 36 | 35 | 75 | 0 |
# Positional hold-out split: rows 0-49 train, rows 50-79 validation.
# Rows 80-99 are reserved as the final test set (used further below).
train_df = df[:50]
validation_df = df[50:80]
# Fit a default (fully grown, unconstrained) decision tree on the training rows.
dt = DecisionTreeClassifier()
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
dt
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
# Validation accuracy of the default tree (mean accuracy of predictions).
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
0.5333333333333333
# Manually try one hyperparameter combination and check validation accuracy.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2)
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
0.5666666666666667
# Hyperparameter grid to search: 2 * 9 * 9 = 162 combinations.
hyperparams = {'criterion': ['gini', 'entropy'],
               'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
               'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
# Exhaustive grid search with three nested loops: for every combination,
# train on train_df, score on validation_df, and record the accuracy
# keyed by the (criterion, max_depth, min_samples_split) tuple.
out = {}
features = ['feature1', 'feature2', 'feature3', 'feature4']
for c in hyperparams['criterion']:
    for d in hyperparams['max_depth']:
        for s in hyperparams['min_samples_split']:
            dt = DecisionTreeClassifier(criterion=c, max_depth=d, min_samples_split=s)
            dt.fit(train_df[features], train_df['target'])
            out[(c, d, s)] = dt.score(validation_df[features], validation_df['target'])
# Wrap the scores in a Series (the tuple keys become a MultiIndex)
# and rank all combinations by validation accuracy, best first.
hp_ser = pd.Series(out)
hp_ser.sort_values(ascending=False)
entropy 10 10 0.766667
7 7 0.766667
9 8 0.766667
8 10 0.766667
9 0.766667
...
gini 10 5 0.500000
8 3 0.500000
7 4 0.500000
3 0.500000
5 2 0.500000
Length: 162, dtype: float64
# Best-scoring (criterion, max_depth, min_samples_split) combination.
hp_ser.idxmax()
('entropy', 4, 6)
# Refit the winning combination on train + validation together
# (the first 80 rows), keeping the last 20 rows untouched for testing.
best_dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
best_dt.fit(df[:80][['feature1', 'feature2', 'feature3', 'feature4']], df[:80]['target'])
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
# Final accuracy estimate on the held-out test rows (80-99), which were
# never used for training or for choosing hyperparameters.
best_dt.score(df[80:][['feature1', 'feature2', 'feature3', 'feature4']], df[80:]['target'])
0.45
Grid search without multiple nested loops (using `itertools.product`)
# Inspect the (name, values) pairs of the grid — three hyperparameters.
print(hyperparams.items(), len(hyperparams.items()))
dict_items([('criterion', ['gini', 'entropy']), ('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]), ('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]) 3
def print_vec(x, y, z):
    """Print three values vertically, bracketed like a column vector."""
    print(f"[{x} \n{y} \n{z}]")

# * unpacks the three (name, values) pairs as positional arguments.
print_vec(*hyperparams.items())
[('criterion', ['gini', 'entropy'])
('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10])
('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]
# zip(*items) transposes the pairs: one tuple of parameter names,
# one tuple of their value lists.
list(zip(*hyperparams.items()))
[('criterion', 'max_depth', 'min_samples_split'),
(['gini', 'entropy'],
[2, 3, 4, 5, 6, 7, 8, 9, 10],
[2, 3, 4, 5, 6, 7, 8, 9, 10])]
# Split the grid into parallel tuples: parameter names and value lists.
keys, values = zip(*hyperparams.items())
keys
('criterion', 'max_depth', 'min_samples_split')
values
(['gini', 'entropy'],
[2, 3, 4, 5, 6, 7, 8, 9, 10],
[2, 3, 4, 5, 6, 7, 8, 9, 10])
import itertools

# The Cartesian product of the value lists yields all 162 combinations;
# show every 10th one for a quick look.
list(itertools.product(*values))[::10]
[('gini', 2, 2),
('gini', 3, 3),
('gini', 4, 4),
('gini', 5, 5),
('gini', 6, 6),
('gini', 7, 7),
('gini', 8, 8),
('gini', 9, 9),
('gini', 10, 10),
('entropy', 3, 2),
('entropy', 4, 3),
('entropy', 5, 4),
('entropy', 6, 5),
('entropy', 7, 6),
('entropy', 8, 7),
('entropy', 9, 8),
('entropy', 10, 9)]
# Grab the first combination from the product iterator.
v = next(itertools.product(*values))
print(v)
('gini', 2, 2)
# Pair each parameter name with its value from the combination.
print_vec(*zip(keys, v))
[('criterion', 'gini')
('max_depth', 2)
('min_samples_split', 2)]
def print_dict(**kwargs):
    """Print the received keyword arguments as a dict."""
    print(kwargs)

# dict(zip(keys, v)) builds {name: value}; ** expands it into kwargs.
print_dict(**dict(zip(keys, v)))
{'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
# The same grid search without nested loops: itertools.product enumerates
# every combination, dict(zip(keys, v)) names the values, and ** expands
# that dict straight into the classifier's constructor.
out = {}
features = ['feature1', 'feature2', 'feature3', 'feature4']
for v in itertools.product(*values):
    params = dict(zip(keys, v))
    dt = DecisionTreeClassifier(**params)
    dt.fit(train_df[features], train_df['target'])
    out[(params['criterion'], params['max_depth'], params['min_samples_split'])] = \
        dt.score(validation_df[features], validation_df['target'])
# Rank all combinations by validation accuracy, best first —
# same result as the nested-loop version above.
pd.Series(out).sort_values(ascending=False)
entropy 10 10 0.766667
7 7 0.766667
9 8 0.766667
8 10 0.766667
9 0.766667
...
3 3 0.500000
4 0.500000
5 0.500000
gini 10 5 0.500000
3 0.500000
Length: 162, dtype: float64