Grid Search

Grid Search
Author

Nipun Batra

Published

January 17, 2023

Hyperparameter Tuning

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

Dataset creation

# Create a DataFrame for classification containing four integer-valued features and one binary target

# Fix the RNG seed so the synthetic dataset — and therefore every
# grid-search score reported below — is reproducible across runs.
np.random.seed(0)

df = pd.DataFrame({
    'feature1': np.random.randint(0, 100, 100),  # integers in [0, 100)
    'feature2': np.random.randint(0, 100, 100),
    'feature3': np.random.randint(0, 100, 100),
    'feature4': np.random.randint(0, 100, 100),
    'target': np.random.randint(0, 2, 100)       # binary labels {0, 1}
})
df.head()
feature1 feature2 feature3 feature4 target
0 29 14 66 83 1
1 68 70 87 72 1
2 42 5 40 67 1
3 2 54 79 0 1
4 81 36 35 75 0
# Hold-out split: first 50 rows for training, next 30 for validation.
# (Rows 80-99 are kept aside and used as the final test set further below.)
train_df = df[:50]
validation_df = df[50:80]
# Baseline: a decision tree with all-default hyperparameters.
dt = DecisionTreeClassifier()
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
dt
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Validation accuracy of the default-hyperparameter baseline.
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
0.5333333333333333
# Try one hand-picked hyperparameter combination and compare its
# validation accuracy against the baseline above.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2)
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
0.5666666666666667
# Search grid: 2 criteria x 9 depths x 9 split sizes = 162 combinations.
hyperparams = {'criterion': ['gini', 'entropy'],
               'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
               'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
# Exhaustive grid search with one nested loop per hyperparameter:
# fit on the training split, record validation accuracy per combination.
out = {}
for c in hyperparams['criterion']:
    for d in hyperparams['max_depth']:
        for s in hyperparams['min_samples_split']:
            dt = DecisionTreeClassifier(criterion=c, max_depth=d, min_samples_split=s)
            dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
            # keyed by the (criterion, max_depth, min_samples_split) tuple
            out[(c, d, s)] = dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
# Wrap the scores in a Series (MultiIndex of the three hyperparameters)
# and rank all 162 combinations by validation accuracy.
hp_ser = pd.Series(out)
hp_ser.sort_values(ascending=False)
entropy  10  10    0.766667
         7   7     0.766667
         9   8     0.766667
         8   10    0.766667
             9     0.766667
                     ...   
gini     10  5     0.500000
         8   3     0.500000
         7   4     0.500000
             3     0.500000
         5   2     0.500000
Length: 162, dtype: float64
# The (criterion, max_depth, min_samples_split) tuple with the best validation accuracy.
hp_ser.idxmax()
('entropy', 4, 6)
# Refit using the best combination found by the grid search. Reading it
# from hp_ser.idxmax() — instead of hard-coding ('entropy', 4, 6) copied
# from one particular run — keeps this cell correct even when the random
# data (and hence the winning combination) changes.
best_criterion, best_max_depth, best_min_samples_split = hp_ser.idxmax()
best_dt = DecisionTreeClassifier(criterion=best_criterion,
                                 max_depth=best_max_depth,
                                 min_samples_split=best_min_samples_split)
# Retrain on train + validation rows (0-79) before the final test evaluation.
best_dt.fit(df[:80][['feature1', 'feature2', 'feature3', 'feature4']], df[:80]['target'])
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Final generalization estimate on the held-out test rows (80-99),
# which were never used for training or hyperparameter selection.
best_dt.score(df[80:][['feature1', 'feature2', 'feature3', 'feature4']], df[80:]['target'])
0.45

Without using multiple nested loops

# A dict's .items() view yields one (name, candidate-values) pair per hyperparameter.
print(hyperparams.items(), len(hyperparams.items()))
dict_items([('criterion', ['gini', 'entropy']), ('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]), ('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]) 3
def print_vec(x, y, z):
    """Print three values bracketed like a column vector, one per line."""
    print("[%s \n%s \n%s]" % (x, y, z))
# Unpack the three (name, values) pairs as separate positional arguments.
print_vec(*hyperparams.items())
[('criterion', ['gini', 'entropy']) 
('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]) 
('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]
# zip(*items) transposes the pairs: one tuple of names, one tuple of value lists.
list(zip(*hyperparams.items()))
[('criterion', 'max_depth', 'min_samples_split'),
 (['gini', 'entropy'],
  [2, 3, 4, 5, 6, 7, 8, 9, 10],
  [2, 3, 4, 5, 6, 7, 8, 9, 10])]
# Separate the hyperparameter names from their candidate-value lists.
keys, values = zip(*hyperparams.items())
keys
('criterion', 'max_depth', 'min_samples_split')
# The candidate value lists, in the same order as `keys`.
values
(['gini', 'entropy'],
 [2, 3, 4, 5, 6, 7, 8, 9, 10],
 [2, 3, 4, 5, 6, 7, 8, 9, 10])
import itertools
# Cartesian product of the value lists gives all 162 combinations; show every 10th.
list(itertools.product(*values))[::10]
[('gini', 2, 2),
 ('gini', 3, 3),
 ('gini', 4, 4),
 ('gini', 5, 5),
 ('gini', 6, 6),
 ('gini', 7, 7),
 ('gini', 8, 8),
 ('gini', 9, 9),
 ('gini', 10, 10),
 ('entropy', 3, 2),
 ('entropy', 4, 3),
 ('entropy', 5, 4),
 ('entropy', 6, 5),
 ('entropy', 7, 6),
 ('entropy', 8, 7),
 ('entropy', 9, 8),
 ('entropy', 10, 9)]
# Grab the first combination from a fresh product iterator.
v = next(itertools.product(*values))
print(v)
('gini', 2, 2)
# Pair each hyperparameter name with its value from the combination.
print_vec(*zip(keys, v))
[('criterion', 'gini') 
('max_depth', 2) 
('min_samples_split', 2)]
def print_dict(**kwargs):
    """Print the collected keyword arguments as a single dict."""
    print(dict(kwargs))

# dict(zip(keys, v)) maps names to values; ** expands it back into keyword args.
print_dict(**(dict(zip(keys, v))))
{'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
# Same grid search as the nested loops above, but driven by a single loop
# over itertools.product of the candidate value lists.
out = {}
for v in itertools.product(*values):
    # Map names to values so the combination can be splatted into the
    # classifier as keyword arguments.
    params = dict(zip(keys, v))
    dt = DecisionTreeClassifier(**params)
    dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
    # v is already the (criterion, max_depth, min_samples_split) tuple,
    # so it serves directly as the result key — no need to rebuild it
    # field by field from `params`.
    out[v] = dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])

pd.Series(out).sort_values(ascending=False)
entropy  10  10    0.766667
         7   7     0.766667
         9   8     0.766667
         8   10    0.766667
             9     0.766667
                     ...   
         3   3     0.500000
             4     0.500000
             5     0.500000
gini     10  5     0.500000
             3     0.500000
Length: 162, dtype: float64