Decision Trees: Discrete Input and Discrete Output

ML

Author: Nipun Batra

Published: January 21, 2024

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Retina mode
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from latexify import latexify, format_axes
df = pd.read_csv("../datasets/tennis-discrete-output.csv", index_col=0)
df
      Outlook  Temp  Humidity   Windy  Play
Day
D1      Sunny   Hot      High    Weak    No
D2      Sunny   Hot      High  Strong    No
D3   Overcast   Hot      High    Weak   Yes
D4       Rain  Mild      High    Weak   Yes
D5       Rain  Cool    Normal    Weak   Yes
D6       Rain  Cool    Normal  Strong    No
D7   Overcast  Cool    Normal  Strong   Yes
D8      Sunny  Mild      High    Weak    No
D9      Sunny  Cool    Normal    Weak   Yes
D10      Rain  Mild    Normal    Weak   Yes
D11     Sunny  Mild    Normal  Strong   Yes
D12  Overcast  Mild      High  Strong   Yes
D13  Overcast   Hot    Normal    Weak   Yes
D14      Rain  Mild      High  Strong    No
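
Before defining an entropy helper, it is worth checking the class balance of the target column. A minimal sketch using the df loaded above; the dataset has 9 "Yes" days and 5 "No" days:

# Count the labels of the target column "Play" (9 Yes, 5 No out of 14 days)
df["Play"].value_counts()
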
def entropy(ser):
    """
    Calculate entropy for a categorical variable.

    Parameters:
    - ser: pd.Series of categorical data

    Returns:
    - Entropy value
    """
    # Count the occurrences of each unique value in the series
    value_counts = ser.value_counts()

    # Calculate the probabilities of each unique value
    probabilities = value_counts / len(ser)

    # Calculate entropy using the formula: H(S) = -p1*log2(p1) - p2*log2(p2) - ...
    entropy_value = -np.sum(probabilities * np.log2(probabilities))

    return entropy_value
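
As a quick sanity check (not in the original notebook), entropy should be 0 bits for a pure series and 1 bit for a perfectly balanced two-class series:

# Illustrative checks: a pure series carries no uncertainty,
# while a 50/50 binary series carries exactly 1 bit
entropy(pd.Series(["Yes"] * 6))        # 0 bits
entropy(pd.Series(["Yes", "No"] * 3))  # 1 bit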
    
entropy(df["Play"])
0.9402859586706311
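
This matches the hand computation for the 9 "Yes" and 5 "No" days out of 14: H(Play) = -(9/14) log2(9/14) - (5/14) log2(5/14) ≈ 0.940. A minimal check of that arithmetic:

# Recompute the same quantity directly from the 9 Yes / 5 No split
p_yes, p_no = 9 / 14, 5 / 14
-(p_yes * np.log2(p_yes) + p_no * np.log2(p_no))  # approximately 0.940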