Decision Trees Real Output

Author

Nipun Batra

Published

January 1, 2024

Open In Colab

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Retina mode
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from latexify import latexify, format_axes

# Load the tennis dataset with a real-valued target; the first CSV column is
# used as the row index.
df = pd.read_csv("../datasets/tennis-real-output.csv", index_col=[0])

# Bare expression so the notebook renders the full table.
df

	Outlook	Temp	Humidity	Wind	Minutes Played
Day
D1	Sunny	Hot	High	Weak	20
D2	Sunny	Hot	High	Strong	24
D3	Overcast	Hot	High	Weak	40
D4	Rain	Mild	High	Weak	50
D5	Rain	Cool	Normal	Weak	60
D6	Rain	Cool	Normal	Strong	10
D7	Overcast	Cool	Normal	Strong	4
D8	Sunny	Mild	High	Weak	10
D9	Sunny	Cool	Normal	Weak	60
D10	Rain	Mild	Normal	Weak	40
D11	Sunny	Mild	High	Strong	45
D12	Overcast	Mild	High	Strong	40
D13	Overcast	Hot	Normal	Weak	35
D14	Rain	Mild	High	Strong	20

# Mean of the target column over the whole dataset.
minutes_played = df["Minutes Played"]
mean_mins = minutes_played.mean()
print(mean_mins)

32.714285714285715

# MSE of the unsplit dataset: mean squared deviation of the target from
# its overall mean.
squared_errors = (df["Minutes Played"] - mean_mins) ** 2
initial_mse = squared_errors.mean()
print(initial_mse)

311.3469387755102

# Explore the MSE of the split induced by the "Wind" attribute
weighted_total_mse = 0.0
for category in df["Wind"].unique():
    subset = df[df["Wind"] == category]
    minutes = subset["Minutes Played"]

    # MSE of this branch around the branch's own mean
    mse_subset = ((minutes - minutes.mean()) ** 2).mean()

    # Each branch contributes in proportion to the share of rows it holds
    weight = len(subset) / len(df)
    weighted_mse = weight * mse_subset
    weighted_total_mse += weighted_mse

    print(minutes.values)
    print(f"Wind: {category}")
    print("Subset MSE:", mse_subset)
    print(f"Weighted MSE = {len(subset)}/{len(df)} * {mse_subset:0.4} = {weighted_mse:0.4}")
    print("\n")

print("Weighted total MSE:", weighted_total_mse)

[20 40 50 60 10 60 40 35]
Wind: Weak
Subset MSE: 277.734375
Weighted MSE = 8/14 * 277.7 = 158.7


[24 10  4 45 40 20]
Wind: Strong
Subset MSE: 218.13888888888889
Weighted MSE = 6/14 * 218.1 = 93.49


Weighted total MSE: 252.19345238095235

# Reduction in MSE from splitting on "Wind": MSE before the split minus the
# size-weighted MSE of the resulting subsets.
reduction_mse_wind = initial_mse - weighted_total_mse
print(reduction_mse_wind)

59.15348639455783

def reduction_mse(df_dataset, input_attribute, target_attribute):
    """Return the reduction in MSE from splitting on a discrete attribute.

    The reduction is the MSE of ``target_attribute`` around its overall mean
    minus the size-weighted sum of the per-branch MSEs, where each distinct
    value of ``input_attribute`` defines one branch and each branch's MSE is
    taken around that branch's own mean.

    Rows whose ``input_attribute`` is missing (NaN/None) are grouped into one
    additional branch. An equality test never matches a missing value, so
    without this such rows would form an empty branch with a NaN MSE and make
    the whole result NaN.

    Parameters
    ----------
    df_dataset : pandas.DataFrame
        Dataset containing both columns.
    input_attribute : str
        Name of the discrete column to split on.
    target_attribute : str
        Name of the real-valued column being predicted.

    Returns
    -------
    float
        ``initial_mse - weighted_total_mse``.
    """
    target = df_dataset[target_attribute]
    splitter = df_dataset[input_attribute]
    n_rows = len(df_dataset)

    # MSE before splitting: predict the overall mean for every row.
    initial_mse = ((target - target.mean()) ** 2).mean()

    # One branch per observed value, in order of first appearance, followed
    # by a branch for rows with a missing attribute value (if any).
    missing = splitter.isna()
    branches = [
        target[splitter == category]
        for category in splitter[~missing].unique()
    ]
    if missing.any():
        branches.append(target[missing])

    weighted_total_mse = 0.0
    for branch in branches:
        mse_branch = ((branch - branch.mean()) ** 2).mean()
        # Weight each branch by the fraction of rows it contains.
        weighted_total_mse += (len(branch) / n_rows) * mse_branch

    return initial_mse - weighted_total_mse

# Reduction in MSE for a split on each candidate attribute, keyed by
# attribute name.
attributes = ["Outlook", "Temp", "Humidity", "Wind"]
reduction = {
    attribute: reduction_mse(df, attribute, "Minutes Played")
    for attribute in attributes
}

reduction_ser = pd.Series(reduction)

latexify()

# Series.plot returns the Axes it drew on; the remaining calls go through it.
ax = reduction_ser.plot(kind='bar', rot=0, color='k')
format_axes(ax)

# Label each bar with its height, centred just above the bar
for patch in ax.patches:
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width()/2
    ax.text(x_center, height, round(height, 2), ha='center', va='bottom')

ax.set_xlabel("Attribute")
ax.set_ylabel("Reduction in MSE")
plt.savefig("../figures/decision-trees/discrete-input-real-output-level-1.pdf")

Other Formats