import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Grid Search
# ===========
# Hyperparameter tuning for a DecisionTreeClassifier: build a random dataset,
# hold out a validation split, then exhaustively score every combination of
# a small hyperparameter grid.

# Dataset creation
# Create a DataFrame for classification: four integer features in [0, 100)
# and one binary target in {0, 1}, 100 rows. NOTE(review): no random seed is
# set, so every run produces a different dataset (and different scores below).
df = pd.DataFrame({
    'feature1': np.random.randint(0, 100, 100),
    'feature2': np.random.randint(0, 100, 100),
    'feature3': np.random.randint(0, 100, 100),
    'feature4': np.random.randint(0, 100, 100),
    'target': np.random.randint(0, 2, 100),
})

# Peek at the first rows. Example output (values vary per run):
#   | feature1 | feature2 | feature3 | feature4 | target |
#   |    29    |    14    |    66    |    83    |   1    |
#   |    68    |    70    |    87    |    72    |   1    |
#   |    42    |     5    |    40    |    67    |   1    |
#   |     2    |    54    |    79    |     0    |   1    |
#   |    81    |    36    |    35    |    75    |   0    |
df.head()
# Hold-out split: rows 0-49 train, rows 50-79 validate; rows 80-99 are kept
# back as a final test set for the tuned model later.
train_df = df[:50]
validation_df = df[50:80]

feature_cols = ['feature1', 'feature2', 'feature3', 'feature4']

# Baseline: a tree with all-default hyperparameters.
dt = DecisionTreeClassifier()
dt.fit(train_df[feature_cols], train_df['target'])
dt.score(validation_df[feature_cols], validation_df['target'])  # e.g. 0.5333

# A single hand-picked alternative already scores slightly better here.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=2)
dt.fit(train_df[feature_cols], train_df['target'])
dt.score(validation_df[feature_cols], validation_df['target'])  # e.g. 0.5667

# Search space for the exhaustive grid search: 2 * 9 * 9 = 162 combinations.
hyperparams = {'criterion': ['gini', 'entropy'],
               'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
               'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

# Maps (criterion, max_depth, min_samples_split) -> validation accuracy.
out = {}
# Exhaustive grid search with three nested loops: fit one tree per
# (criterion, max_depth, min_samples_split) combination on the training split
# and record its validation accuracy under that key.
for c in hyperparams['criterion']:
    for d in hyperparams['max_depth']:
        for s in hyperparams['min_samples_split']:
            dt = DecisionTreeClassifier(criterion=c, max_depth=d, min_samples_split=s)
            dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']],
                   train_df['target'])
            out[(c, d, s)] = dt.score(
                validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
                validation_df['target'])

# Collect the 162 scores into a Series keyed by the hyperparameter tuple.
hp_ser = pd.Series(out)
# Rank all combinations by validation accuracy. Example output (random data,
# varies per run):
#   entropy  10  10    0.766667
#            7   7     0.766667
#            9   8     0.766667
#                 ...
#   gini     5   2     0.500000
#   Length: 162, dtype: float64
hp_ser.sort_values(ascending=False)

# Best-scoring combination, e.g. ('entropy', 4, 6).
hp_ser.idxmax()

# Refit the winning configuration on train + validation (first 80 rows), then
# evaluate once on the untouched last 20 rows. Example result: 0.45 — worse
# than the validation score, as expected when tuning on random labels.
best_dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=6)
best_dt.fit(df[:80][['feature1', 'feature2', 'feature3', 'feature4']], df[:80]['target'])
best_dt.score(df[80:][['feature1', 'feature2', 'feature3', 'feature4']], df[80:]['target'])

# Without using multiple nested loops
# hyperparams.items() yields three (name, candidate-values) pairs.
print(hyperparams.items(), len(hyperparams.items()))


def print_vec(x, y, z):
    """Print three values stacked vertically, bracketed like a column vector."""
    print(f"[{x} \n{y} \n{z}]")


# The * spread passes the three (name, values) pairs as positional arguments.
print_vec(*hyperparams.items())

# zip(*pairs) transposes: first element collects the names, second the value
# lists — i.e. (('criterion', 'max_depth', 'min_samples_split'), ([...], ...)).
list(zip(*hyperparams.items()))
  [2, 3, 4, 5, 6, 7, 8, 9, 10])]keys, values = zip(*hyperparams.items())keys('criterion', 'max_depth', 'min_samples_split')values(['gini', 'entropy'],
 [2, 3, 4, 5, 6, 7, 8, 9, 10],
 [2, 3, 4, 5, 6, 7, 8, 9, 10])import itertools
# itertools.product enumerates the full Cartesian product of the value lists
# (all 162 combinations); sample every 10th entry for a quick look:
#   [('gini', 2, 2), ('gini', 3, 3), ..., ('entropy', 10, 9)]
list(itertools.product(*values))[::10]

# First combination from a fresh product iterator: ('gini', 2, 2).
v = next(itertools.product(*values))
print(v)  # ('gini', 2, 2)

# Pair each hyperparameter name with its value from the combination.
print_vec(*zip(keys, v))


def print_dict(**kwargs):
    """Print the keyword arguments this function received, as a dict."""
    print(kwargs)


# dict(zip(keys, v)) rebuilds {'criterion': 'gini', 'max_depth': 2, ...};
# the ** spread then passes it as keyword arguments.
print_dict(**(dict(zip(keys, v))))

# Fresh results dict for the loop-free grid search below.
out = {}
# Grid search without nested loops: one flat loop over the Cartesian product,
# rebuilding the keyword dict for each combination and passing it via **.
for combo in itertools.product(*values):
    params = dict(zip(keys, combo))
    dt = DecisionTreeClassifier(**params)
    dt.fit(train_df[['feature1', 'feature2', 'feature3', 'feature4']],
           train_df['target'])
    out[(params['criterion'], params['max_depth'], params['min_samples_split'])] = dt.score(
        validation_df[['feature1', 'feature2', 'feature3', 'feature4']],
        validation_df['target'])

# Rank once, after the loop (the mangled original re-sorted inside every
# iteration, doing 162x the work for the same final result). Example output:
#   entropy  10  10    0.766667
#            7   7     0.766667
#                 ...
#   gini     10  5     0.500000
#   Length: 162, dtype: float64
pd.Series(out).sort_values(ascending=False)