Decision Trees Real Output

ML
Author

Nipun Batra

Published

January 1, 2024

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Retina mode
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from latexify import latexify, format_axes
# Load the tennis dataset; index_col=[0] makes the first CSV column the row index.
df = pd.read_csv("../datasets/tennis-real-output.csv", index_col=[0])
# Bare expression so the notebook renders the DataFrame as the cell output.
df
Outlook Temp Humidity Wind Minutes Played
Day
D1 Sunny Hot High Weak 20
D2 Sunny Hot High Strong 24
D3 Overcast Hot High Weak 40
D4 Rain Mild High Weak 50
D5 Rain Cool Normal Weak 60
D6 Rain Cool Normal Strong 10
D7 Overcast Cool Normal Strong 4
D8 Sunny Mild High Weak 10
D9 Sunny Cool Normal Weak 60
D10 Rain Mild Normal Weak 40
D11 Sunny Mild High Strong 45
D12 Overcast Mild High Strong 40
D13 Overcast Hot Normal Weak 35
D14 Rain Mild High Strong 20
# Baseline prediction before any split: the mean of the target column.
minutes_played = df["Minutes Played"]
mean_mins = minutes_played.mean()
print(mean_mins)
32.714285714285715
# MSE of predicting the overall mean for every row (error before splitting).
squared_errors = (df["Minutes Played"] - mean_mins) ** 2
initial_mse = squared_errors.mean()
print(initial_mse)
311.3469387755102
# Explore the MSE of the split induced by the "Wind" attribute:
# each category forms one child node, and the children's MSEs are
# combined weighted by the fraction of rows that fall into each child.
n_total = len(df)  # loop-invariant, so computed once
weighted_total_mse = 0.0
for category in df["Wind"].unique():
    subset = df[df["Wind"] == category]
    subset_target = subset["Minutes Played"]

    # MSE of the subset about its own mean (the child's prediction)
    mse_subset = ((subset_target - subset_target.mean()) ** 2).mean()

    # Weight the child's MSE by its share of the rows
    weighted_mse = (len(subset) / n_total) * mse_subset
    weighted_total_mse += weighted_mse

    print(subset_target.values)
    print(f"Wind: {category}")
    print("Subset MSE:", mse_subset)
    print(f"Weighted MSE = {len(subset)}/{n_total} * {mse_subset:0.4} = {weighted_mse:0.4}")
    print("\n")

print("Weighted total MSE:", weighted_total_mse)
[20 40 50 60 10 60 40 35]
Wind: Weak
Subset MSE: 277.734375
Weighted MSE = 8/14 * 277.7 = 158.7


[24 10  4 45 40 20]
Wind: Strong
Subset MSE: 218.13888888888889
Weighted MSE = 6/14 * 218.1 = 93.49


Weighted total MSE: 252.19345238095235
# Reduction in MSE from splitting on "Wind": error before the split minus
# the weighted error of the children computed above.
reduction_mse_wind = initial_mse - weighted_total_mse
print(reduction_mse_wind)
59.15348639455783
def reduction_mse(df_dataset, input_attribute, target_attribute):
    """Return the reduction in MSE obtained by splitting on a discrete attribute.

    The dataset is partitioned by the distinct values of ``input_attribute``.
    Each partition predicts the mean of ``target_attribute`` over its own
    rows; the partitions' MSEs are combined, weighted by the fraction of rows
    in each, and subtracted from the MSE of the unsplit dataset.

    Rows whose ``input_attribute`` is missing (NaN/None) are treated as one
    additional partition, so missing values do not turn the result into NaN.

    Parameters
    ----------
    df_dataset : pandas.DataFrame
        Data containing both columns.
    input_attribute : str
        Name of the discrete column to split on.
    target_attribute : str
        Name of the real-valued column being predicted.

    Returns
    -------
    float
        ``initial_mse - weighted_total_mse``. NaN for an empty dataset.
    """
    target = df_dataset[target_attribute]
    # MSE of predicting the global mean for every row
    initial_mse = ((target - target.mean()) ** 2).mean()

    n_total = len(df_dataset)
    weighted_total_mse = 0.0
    # sort=False keeps categories in order of first appearance;
    # dropna=False keeps rows with a missing attribute as their own group
    # (an equality mask against NaN would select no rows at all).
    grouped_target = df_dataset.groupby(
        input_attribute, sort=False, dropna=False
    )[target_attribute]
    for _, subset_target in grouped_target:
        mse_subset = ((subset_target - subset_target.mean()) ** 2).mean()
        weighted_total_mse += (len(subset_target) / n_total) * mse_subset

    return initial_mse - weighted_total_mse

    
# Reduction in MSE for each candidate attribute when splitting the full dataset
candidate_attributes = ["Outlook", "Temp", "Humidity", "Wind"]
reduction = {
    attribute: reduction_mse(df, attribute, "Minutes Played")
    for attribute in candidate_attributes
}

reduction_ser = pd.Series(reduction)
latexify()
# Series.plot returns the Axes holding the bars
ax = reduction_ser.plot(kind='bar', rot=0, color='k')
format_axes(plt.gca())

# Label every bar with its height, rounded to two decimals
for patch in ax.patches:
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width()/2
    plt.text(x_center, height, round(height, 2), ha='center', va='bottom')

plt.xlabel("Attribute")
plt.ylabel("Reduction in MSE")
plt.savefig("../figures/decision-trees/discrete-input-real-output-level-1.pdf")