import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Grid Search — Hyperparameter Tuning

# Dataset creation
# Create a DataFrame for classification containing four real features and one binary target
# Build a toy classification dataset: 100 rows, four integer features in
# [0, 100) and a binary target in {0, 1}.
# NOTE(review): no random seed is set, so every run produces different data;
# the table below is one sample run.
df = pd.DataFrame(
    {
        'feature1': np.random.randint(0, 100, 100),
        'feature2': np.random.randint(0, 100, 100),
        'feature3': np.random.randint(0, 100, 100),
        'feature4': np.random.randint(0, 100, 100),
        'target': np.random.randint(0, 2, 100),
    }
)

df.head()
# Sample output:
#    feature1  feature2  feature3  feature4  target
# 0        29        14        66        83       1
# 1        68        70        87        72       1
# 2        42         5        40        67       1
# 3         2        54        79         0       1
# 4        81        36        35        75       0
# Hold-out split: first 50 rows for training, next 30 for validation
# (the final 20 rows are reserved as a test set, used further below).
train_df = df[:50]
validation_df = df[50:80]

# Baseline: an unconstrained decision tree (grows until leaves are pure).
dt = DecisionTreeClassifier()
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])

dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
# -> 0.5333333333333333 (sample run)

# Manually chosen hyperparameters: entropy criterion with a shallow tree.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2)
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
# -> 0.5666666666666667 (sample run)
# Exhaustive grid search with nested loops: 2 * 9 * 9 = 162 combinations.
hyperparams = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
}

# Map each (criterion, max_depth, min_samples_split) triple to its
# validation accuracy.
out = {}
for c in hyperparams['criterion']:
    for d in hyperparams['max_depth']:
        for s in hyperparams['min_samples_split']:
            dt = DecisionTreeClassifier(criterion=c, max_depth=d, min_samples_split=s)
            dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
            out[(c, d, s)] = dt.score(
                validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
                validation_df['target'],
            )

# A Series indexed by the hyperparameter triples makes ranking trivial.
hp_ser = pd.Series(out)
hp_ser.sort_values(ascending=False)
# Sample output (162 combinations):
# entropy  10  10    0.766667
#          7   7     0.766667
#          9   8     0.766667
#          8   10    0.766667
#              9     0.766667
#                      ...
# gini     10  5     0.500000
#          8   3     0.500000
#          7   4     0.500000
#              3     0.500000
#          5   2     0.500000
# Length: 162, dtype: float64

hp_ser.idxmax()
# -> ('entropy', 4, 6) (sample run)
# Refit the best configuration found above on train + validation data
# (first 80 rows), then evaluate once on the held-out test set (last 20).
best_dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
best_dt.fit(df[:80][['feature1', 'feature2', 'feature3', 'feature4']], df[:80]['target'])

best_dt.score(df[80:][['feature1', 'feature2', 'feature3', 'feature4']], df[80:]['target'])
# -> 0.45 (sample run; the features are random noise, so ~0.5 is expected)
# Without using multiple nested loops
# hyperparams.items() yields three (name, value-list) pairs.
print(hyperparams.items(), len(hyperparams.items()))
# dict_items([('criterion', ['gini', 'entropy']),
#             ('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]),
#             ('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]) 3


def print_vec(x, y, z):
    """Print the three arguments stacked vertically inside brackets."""
    print(f"[{x} \n{y} \n{z}]")


# Star-unpacking spreads the three (key, value-list) pairs over x, y, z.
print_vec(*hyperparams.items())
# [('criterion', ['gini', 'entropy'])
# ('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10])
# ('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]
# zip(*items) transposes the (key, values) pairs: first element collects all
# keys, second collects all value lists.
list(zip(*hyperparams.items()))
# [('criterion', 'max_depth', 'min_samples_split'),
#  (['gini', 'entropy'],
#   [2, 3, 4, 5, 6, 7, 8, 9, 10],
#   [2, 3, 4, 5, 6, 7, 8, 9, 10])]

keys, values = zip(*hyperparams.items())
keys
# ('criterion', 'max_depth', 'min_samples_split')
values
# (['gini', 'entropy'],
#  [2, 3, 4, 5, 6, 7, 8, 9, 10],
#  [2, 3, 4, 5, 6, 7, 8, 9, 10])
import itertools

# Cartesian product of the three value lists: 2 * 9 * 9 = 162 combinations.
# Every 10th combination shown as a sample:
list(itertools.product(*values))[::10]
# [('gini', 2, 2), ('gini', 3, 3), ('gini', 4, 4), ('gini', 5, 5),
#  ('gini', 6, 6), ('gini', 7, 7), ('gini', 8, 8), ('gini', 9, 9),
#  ('gini', 10, 10), ('entropy', 3, 2), ('entropy', 4, 3), ('entropy', 5, 4),
#  ('entropy', 6, 5), ('entropy', 7, 6), ('entropy', 8, 7), ('entropy', 9, 8),
#  ('entropy', 10, 9)]

# Take the first combination from the product iterator.
v = next(itertools.product(*values))
print(v)
# ('gini', 2, 2)

# Pair each hyperparameter name back with its value from the combination.
print_vec(*zip(keys, v))
# [('criterion', 'gini')
# ('max_depth', 2)
# ('min_samples_split', 2)]


def print_dict(**kwargs):
    """Print the received keyword arguments as a dict."""
    print(kwargs)


# dict(zip(keys, v)) rebuilds a {name: value} mapping, ready for ** expansion.
print_dict(**(dict(zip(keys, v))))
# {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
# Same grid search, but a single loop over the cartesian product replaces the
# three nested loops; dict(zip(keys, combo)) turns each combination into
# keyword arguments for the classifier.
out = {}
for combo in itertools.product(*values):
    params = dict(zip(keys, combo))
    dt = DecisionTreeClassifier(**params)
    dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
    # combo is exactly (criterion, max_depth, min_samples_split), so it can be
    # used directly as the key.
    out[combo] = dt.score(
        validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
        validation_df['target'],
    )

pd.Series(out).sort_values(ascending=False)
# Sample output (162 combinations):
# entropy  10  10    0.766667
#          7   7     0.766667
#          9   8     0.766667
#          8   10    0.766667
#              9     0.766667
#                      ...
#          3   3     0.500000
#              4     0.500000
#              5     0.500000
# gini     10  5     0.500000
#              3     0.500000
# Length: 162, dtype: float64