import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Grid Search — Hyperparameter Tuning

# Dataset creation
# Create a DataFrame for classification containing four real features and one binary target
# Build a toy classification dataset: 100 rows, four integer features in
# [0, 100) and a binary target in {0, 1}.
# NOTE(review): no random seed is set, so every run produces different data;
# the table below is one sample run.
df = pd.DataFrame(
    {
        'feature1': np.random.randint(0, 100, 100),
        'feature2': np.random.randint(0, 100, 100),
        'feature3': np.random.randint(0, 100, 100),
        'feature4': np.random.randint(0, 100, 100),
        'target': np.random.randint(0, 2, 100),
    }
)

df.head()
# Sample output:
#    feature1  feature2  feature3  feature4  target
# 0        29        14        66        83       1
# 1        68        70        87        72       1
# 2        42         5        40        67       1
# 3         2        54        79         0       1
# 4        81        36        35        75       0
# Hold-out split: first 50 rows for training, next 30 for validation
# (the final 20 rows are reserved as a test set, used further below).
train_df = df[:50]
validation_df = df[50:80]

# Baseline: an unconstrained decision tree (grows until leaves are pure).
dt = DecisionTreeClassifier()
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])

dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
# -> 0.5333333333333333 (sample run)

# Manually chosen hyperparameters: entropy criterion with a shallow tree.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2)
dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
dt.score(validation_df[['feature1', 'feature2', 'feature3', 'feature4']], validation_df['target'])
# -> 0.5666666666666667 (sample run)
# Exhaustive grid search with nested loops: 2 * 9 * 9 = 162 combinations.
hyperparams = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
}

# Map each (criterion, max_depth, min_samples_split) triple to its
# validation accuracy.
out = {}
for c in hyperparams['criterion']:
    for d in hyperparams['max_depth']:
        for s in hyperparams['min_samples_split']:
            dt = DecisionTreeClassifier(criterion=c, max_depth=d, min_samples_split=s)
            dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
            out[(c, d, s)] = dt.score(
                validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
                validation_df['target'],
            )

# A Series indexed by the hyperparameter triples makes ranking trivial.
hp_ser = pd.Series(out)
hp_ser.sort_values(ascending=False)
# Sample output (162 combinations):
# entropy  10  10    0.766667
#          7   7     0.766667
#          9   8     0.766667
#          8   10    0.766667
#              9     0.766667
#                      ...
# gini     10  5     0.500000
#          8   3     0.500000
#          7   4     0.500000
#              3     0.500000
#          5   2     0.500000
# Length: 162, dtype: float64

hp_ser.idxmax()
# -> ('entropy', 4, 6) (sample run)
# Refit the best configuration found above on train + validation data
# (first 80 rows), then evaluate once on the held-out test set (last 20).
best_dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
best_dt.fit(df[:80][['feature1', 'feature2', 'feature3', 'feature4']], df[:80]['target'])

best_dt.score(df[80:][['feature1', 'feature2', 'feature3', 'feature4']], df[80:]['target'])
# -> 0.45 (sample run; the features are random noise, so ~0.5 is expected)
# Without using multiple nested loops
# hyperparams.items() yields three (name, value-list) pairs.
print(hyperparams.items(), len(hyperparams.items()))
# dict_items([('criterion', ['gini', 'entropy']),
#             ('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10]),
#             ('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]) 3


def print_vec(x, y, z):
    """Print the three arguments stacked vertically inside brackets."""
    print(f"[{x} \n{y} \n{z}]")


# Star-unpacking spreads the three (key, value-list) pairs over x, y, z.
print_vec(*hyperparams.items())
# [('criterion', ['gini', 'entropy'])
# ('max_depth', [2, 3, 4, 5, 6, 7, 8, 9, 10])
# ('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10])]
# zip(*items) transposes the (key, values) pairs: first element collects all
# keys, second collects all value lists.
list(zip(*hyperparams.items()))
# [('criterion', 'max_depth', 'min_samples_split'),
#  (['gini', 'entropy'],
#   [2, 3, 4, 5, 6, 7, 8, 9, 10],
#   [2, 3, 4, 5, 6, 7, 8, 9, 10])]

keys, values = zip(*hyperparams.items())
keys
# ('criterion', 'max_depth', 'min_samples_split')
values
# (['gini', 'entropy'],
#  [2, 3, 4, 5, 6, 7, 8, 9, 10],
#  [2, 3, 4, 5, 6, 7, 8, 9, 10])
import itertools

# Cartesian product of the three value lists: 2 * 9 * 9 = 162 combinations.
# Every 10th combination shown as a sample:
list(itertools.product(*values))[::10]
# [('gini', 2, 2), ('gini', 3, 3), ('gini', 4, 4), ('gini', 5, 5),
#  ('gini', 6, 6), ('gini', 7, 7), ('gini', 8, 8), ('gini', 9, 9),
#  ('gini', 10, 10), ('entropy', 3, 2), ('entropy', 4, 3), ('entropy', 5, 4),
#  ('entropy', 6, 5), ('entropy', 7, 6), ('entropy', 8, 7), ('entropy', 9, 8),
#  ('entropy', 10, 9)]

# Take the first combination from the product iterator.
v = next(itertools.product(*values))
print(v)
# ('gini', 2, 2)

# Pair each hyperparameter name back with its value from the combination.
print_vec(*zip(keys, v))
# [('criterion', 'gini')
# ('max_depth', 2)
# ('min_samples_split', 2)]


def print_dict(**kwargs):
    """Print the received keyword arguments as a dict."""
    print(kwargs)


# dict(zip(keys, v)) rebuilds a {name: value} mapping, ready for ** expansion.
print_dict(**(dict(zip(keys, v))))
# {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}
# Same grid search, but a single loop over the cartesian product replaces the
# three nested loops; dict(zip(keys, combo)) turns each combination into
# keyword arguments for the classifier.
out = {}
for combo in itertools.product(*values):
    params = dict(zip(keys, combo))
    dt = DecisionTreeClassifier(**params)
    dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']], train_df['target'])
    # combo is exactly (criterion, max_depth, min_samples_split), so it can be
    # used directly as the key.
    out[combo] = dt.score(
        validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
        validation_df['target'],
    )

pd.Series(out).sort_values(ascending=False)
# Sample output (162 combinations):
# entropy  10  10    0.766667
#          7   7     0.766667
#          9   8     0.766667
#          8   10    0.766667
#              9     0.766667
#                      ...
#          3   3     0.500000
#              4     0.500000
#              5     0.500000
# gini     10  5     0.500000
#              3     0.500000
# Length: 162, dtype: float64