import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Retina mode
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from latexify import latexify, format_axes
Decision Trees: Discrete Input and Discrete Output
df = pd.read_csv("../datasets/tennis-discrete-output.csv", index_col=0)
df
Day | Outlook | Temp | Humidity | Windy | Play |
---|---|---|---|---|---|
D1 | Sunny | Hot | High | Weak | No |
D2 | Sunny | Hot | High | Strong | No |
D3 | Overcast | Hot | High | Weak | Yes |
D4 | Rain | Mild | High | Weak | Yes |
D5 | Rain | Cool | Normal | Weak | Yes |
D6 | Rain | Cool | Normal | Strong | No |
D7 | Overcast | Cool | Normal | Strong | Yes |
D8 | Sunny | Mild | High | Weak | No |
D9 | Sunny | Cool | Normal | Weak | Yes |
D10 | Rain | Mild | Normal | Weak | Yes |
D11 | Sunny | Mild | Normal | Strong | Yes |
D12 | Overcast | Mild | High | Strong | Yes |
D13 | Overcast | Hot | Normal | Weak | Yes |
D14 | Rain | Mild | High | Strong | No |
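Before computing entropy, it is useful to look at the class balance of the target column. The table above has 9 "Yes" and 5 "No" rows; a quick check (a small sketch, not part of the original notebook) is:

df["Play"].value_counts()
# Yes    9
# No     5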
def entropy(ser):
    """
    Calculate entropy for a categorical variable.

    Parameters:
    - ser: pd.Series of categorical data

    Returns:
    - Entropy value
    """
    # Count the occurrences of each unique value in the series
    value_counts = ser.value_counts()

    # Calculate the probabilities of each unique value
    probabilities = value_counts / len(ser)

    # Calculate entropy using the formula: H(S) = -p1*log2(p1) - p2*log2(p2) - ...
    entropy_value = -np.sum(probabilities * np.log2(probabilities))

    return entropy_value
"Play"]) entropy(df[
0.9402859586706311
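As a sanity check, the same value follows from applying the entropy formula directly to the class counts (9 Yes, 5 No out of 14 rows). This manual computation is added here for illustration and is not part of the original notebook:

# Manual check: H(Play) = -(9/14)*log2(9/14) - (5/14)*log2(5/14)
-(9/14) * np.log2(9/14) - (5/14) * np.log2(5/14)
# ~0.9403, matching entropy(df["Play"]) above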