import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Retina display
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from latexify import latexify, format_axes
FIG_WIDTH = 5
FIG_HEIGHT = 4
Decision Trees [Real I/P Real O/P, Bias vs Variance]
ML Tutorial
# Create dataset
x = np.array([1, 2, 3, 4, 5, 6])
y = np.array([0, 0, 1, 1, 2, 2])
# Plot data
latexify(columns=2)
plt.scatter(x, y, color='k')
format_axes(plt.gca())
plt.savefig("../figures/decision-trees/ri-ro-dataset.pdf")
# Depth 0 tree
# Average of all y values
y_pred = np.mean(y)

# Plot data
latexify(columns=2)
plt.scatter(x, y, color='C1', label='data')

# Plot prediction
plt.plot([0, 7], [y_pred, y_pred], color='k', linestyle='-', label='Prediction')

format_axes(plt.gca())
plt.legend()
plt.savefig("../figures/decision-trees/ri-ro-depth-0.pdf")
from sklearn.tree import DecisionTreeRegressor

def create_DT_Regressor(x, y, depth, filename):
    dt = DecisionTreeRegressor(max_depth=depth)
    dt.fit(x.reshape(-1, 1), y)

    # Plot data
    latexify(columns=2)
    plt.scatter(x, y, color='C1', label='Data')

    # Plot prediction on a dense grid
    x_test = np.linspace(0, 7, 500)
    y_test = dt.predict(x_test.reshape(-1, 1))
    plt.plot(x_test, y_test, color='k', label='Prediction')

    format_axes(plt.gca())
    plt.legend()
    plt.savefig(f"../figures/decision-trees/{filename}.pdf")
    return dt
dt_one = create_DT_Regressor(x, y, 1, "ri-ro-depth-1")
from sklearn.tree import export_graphviz
import graphviz

def create_graph(dt, filename, feature_names=['x']):
    dot_data = export_graphviz(dt, out_file=None, feature_names=feature_names, filled=True)
    graph = graphviz.Source(dot_data)
    graph.format = 'pdf'
    graph.render(f"../figures/decision-trees/{filename}")
    return graph
"ri-ro-depth-1-sklearn") create_graph(dt_one,
= create_DT_Regressor(x, y, 2, "ri-ro-depth-2") dt_two
"ri-ro-depth-2-sklearn") create_graph(dt_two,
= create_DT_Regressor(x, y, 3, "ri-ro-depth-3")
dt_three
"ri-ro-depth-3-sklearn") create_graph(dt_three,
Sine Dataset
### Sine dataset
x = np.linspace(0, 2*np.pi, 200)
y = np.sin(x)
latexify(columns=2)
plt.scatter(x, y, color='k', s=1)
format_axes(plt.gca())
plt.savefig("../figures/decision-trees/sine-dataset.pdf")
= create_DT_Regressor(x, y, 1, "sine-depth-1") dt_sine_one
# Root-node error: MSE of always predicting the mean of y (a depth-0 tree)
mean_y = np.mean(y)
error_vector = y - mean_y
squared_error = np.sum(error_vector**2)
mean_squared_error = squared_error / len(y)
print(f"Mean squared error: {mean_squared_error:0.4f}")
Mean squared error: 0.4975
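This matches a useful identity: the MSE of always predicting the mean is exactly the (biased) sample variance of the targets, so np.var reproduces the same number:

# Depth-0 MSE == biased sample variance of y
assert np.isclose(np.var(y), mean_squared_error)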
"sine-depth-1-sklearn") create_graph(dt_sine_one,
# Evaluate the split at x = pi by hand
split = np.pi
left = y[x < split]
right = y[x >= split]

mean_left = np.mean(left)
mean_right = np.mean(right)

error_vector_left = left - mean_left
error_vector_right = right - mean_right

squared_error_left = np.sum(error_vector_left**2)
squared_error_right = np.sum(error_vector_right**2)

mean_squared_error_left = squared_error_left / len(left)
mean_squared_error_right = squared_error_right / len(right)

print(f"Mean squared error left: {mean_squared_error_left:0.4f}")
print(f"Mean value left: {mean_left:0.4f}")
print(f"Number of samples in left: {len(left)}")
print("---"*20)
print(f"Mean squared error right: {mean_squared_error_right:0.4f}")
print(f"Mean value right: {mean_right:0.4f}")
print(f"Number of samples in right: {len(right)}")

# Weighted (by leaf size) MSE after the split, and the reduction it buys
weighted_error = len(left) / len(y) * mean_squared_error_left + len(right) / len(y) * mean_squared_error_right
print("---"*20)
print(f"Weighted error: {weighted_error:0.4f}")

reduction = mean_squared_error - weighted_error
print(f"Reduction: {reduction:0.4f}")
Mean squared error left: 0.0963
Mean value left: 0.6334
Number of samples in left: 100
------------------------------------------------------------
Mean squared error right: 0.0963
Mean value right: -0.6334
Number of samples in right: 100
------------------------------------------------------------
Weighted error: 0.0963
Reduction: 0.4012
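The split at pi was picked here by symmetry; a regression tree instead searches all candidate thresholds greedily and keeps the one with the lowest weighted error. A small sketch of that search (the weighted_mse helper is introduced here for illustration); its best threshold should land at or next to pi:

def weighted_mse(threshold):
    # Weighted (by leaf size) MSE of predicting each side's mean
    left, right = y[x < threshold], y[x >= threshold]
    return (len(left) * np.var(left) + len(right) * np.var(right)) / len(y)

# Candidate thresholds: midpoints between consecutive x values
candidates = (x[:-1] + x[1:]) / 2
best = min(candidates, key=weighted_mse)
print(f"Best split: {best:0.4f} (pi = {np.pi:0.4f})")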
dt_sine_two = create_DT_Regressor(x, y, 2, "sine-depth-2")
create_graph(dt_sine_two, "sine-depth-2-sklearn")

dt_sine_four = create_DT_Regressor(x, y, 4, "sine-depth-4")
Bias-Variance Tradeoff - Dataset I
### Dataset for showing bias-variance tradeoff
X = np.array([[1, 1], [2, 1], [3, 1], [5, 1],
              [6, 1], [7, 1], [1, 2], [2, 2],
              [6, 2], [7, 2], [1, 4], [7, 4]])
y = np.array([0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1])
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
format_axes(plt.gca())
plt.savefig("../figures/decision-trees/bias-variance-dataset.pdf")
from sklearn.tree import DecisionTreeClassifier

def create_DT_Classifier(X, y, depth, filename):
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X, y)

    # Predict in entire 2d space and contour plot
    x1 = np.linspace(0, 8, 100)
    x2 = np.linspace(0, 5, 100)
    X1, X2 = np.meshgrid(x1, x2)
    X_test = np.stack([X1.flatten(), X2.flatten()], axis=1)
    y_test = dt.predict(X_test)

    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.contourf(X1, X2, y_test.reshape(X1.shape), alpha=0.1, cmap='coolwarm')
    format_axes(plt.gca())
    plt.savefig(f"../figures/decision-trees/{filename}.pdf")
    return dt
dt_bias_variance_one = create_DT_Classifier(X, y, 1, "bias-variance-depth-1")
create_graph(dt_bias_variance_one, "bias-variance-depth-1-sklearn", feature_names=['x1', 'x2'])

dt_bias_variance_full_depth = create_DT_Classifier(X, y, None, "bias-variance-full-depth")
create_graph(dt_bias_variance_full_depth, "bias-variance-full-depth-sklearn", feature_names=['x1', 'x2'])
Bias-Variance Tradeoff - Dataset II
# Bias variance dataset 2
# X is all integers from (1, 1) to (6, 6)
X = np.array([[i, j] for i in range(1, 7) for j in range(1, 7)])
y = np.zeros(len(X), dtype=int)
y[(2 <= X[:, 0]) & (X[:, 0] <= 5) & (2 <= X[:, 1]) & (X[:, 1] <= 5)] = 1
plt.scatter(X[:, 0], X[:, 1], c=y)
# Flip two interior points to class 0 (label noise)
special_condition = (X[:, 0] == 3) & (X[:, 1] == 3) | (X[:, 0] == 4) & (X[:, 1] == 4)
y[special_condition] = 0

plt.scatter(X[:, 0], X[:, 1], c=y)
format_axes(plt.gca())
plt.savefig("../figures/decision-trees/bias-variance-dataset-2.pdf")
# X_test: random uniform from (1, 1) to (6, 6) of size 1000
X_test = np.random.uniform(1, 6, size=(1000, 2))
y_test = np.zeros(len(X_test), dtype=int)
y_test[(2 <= X_test[:, 0]) & (X_test[:, 0] <= 5) & (2 <= X_test[:, 1]) & (X_test[:, 1] <= 5)] = 1

plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, alpha=0.1)
format_axes(plt.gca())
plt.savefig("../figures/decision-trees/bias-variance-dataset-2-test.pdf")
def create_DT_Classifier_with_graph(X, y, depth, filename):
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X, y)

    # Predict in entire 2d space and contour plot
    x1 = np.linspace(0.5, 6.5, 100)
    x2 = np.linspace(0.5, 6.5, 100)
    X1, X2 = np.meshgrid(x1, x2)
    X_contour = np.stack([X1.flatten(), X2.flatten()], axis=1)
    y_contour = dt.predict(X_contour)

    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.contourf(X1, X2, y_contour.reshape(X1.shape), alpha=0.1, cmap='coolwarm')
    format_axes(plt.gca())
    plt.savefig(f"../figures/decision-trees/{filename}.pdf")

    # Export tree
    dot_data = export_graphviz(dt, out_file=None, feature_names=['x1', 'x2'], filled=True)
    graph = graphviz.Source(dot_data)
    graph.format = 'pdf'
    graph.render(f"../figures/decision-trees/{filename}-sklearn")
# Underfitting
create_DT_Classifier_with_graph(X, y, 2, "bias-variance-depth-2")

# Overfitting
create_DT_Classifier_with_graph(X, y, None, "bias-variance-full-depth")

# Good fit
create_DT_Classifier_with_graph(X, y, 4, "bias-variance-good-fit")
Test Accuracies
from sklearn.metrics import accuracy_score

### Train and test accuracy vs depth
depths = np.arange(2, 10)
train_accs = {}
test_accs = {}
for depth in depths:
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X, y)
    train_accs[depth] = accuracy_score(y, dt.predict(X))
    test_accs[depth] = accuracy_score(y_test, dt.predict(X_test))

train_accs = pd.Series(train_accs)
test_accs = pd.Series(test_accs)
train_accs
2 0.722222
3 0.833333
4 0.944444
5 0.944444
6 0.944444
7 0.944444
8 0.944444
9 0.944444
dtype: float64
ax = train_accs.plot(label='Train')
test_accs.plot(label='Test', ax=ax)
plt.xlabel("Depth")
plt.ylabel("Accuracy")
plt.legend()
plt.ylim(0, 1.1)
format_axes(plt.gca())
plt.savefig("../figures/decision-trees/bias-variance-accuracy-vs-depth.pdf")
# Highlight the underfitting region (depth <= 4); fill with green
plt.fill_between(depths, 0, 1, where=depths <= 4, color='g', alpha=0.1, label='Underfitting')
plt.legend()
plt.savefig("../figures/decision-trees/bias-variance-accuracy-vs-depth-underfitting.pdf")

# Highlight the overfitting region (depth >= 7); fill with red
plt.fill_between(depths, 0, 1, where=depths >= 7, color='r', alpha=0.1, label='Overfitting')
plt.legend()
plt.savefig("../figures/decision-trees/bias-variance-accuracy-vs-depth-overfitting.pdf")

# Highlight the good-fit region (4 <= depth <= 7); fill with blue
plt.fill_between(depths, 0, 1, where=(depths >= 4) & (depths <= 7), color='b', alpha=0.1, label='Good fit')
plt.legend()
plt.savefig("../figures/decision-trees/bias-variance-accuracy-vs-depth-good-fit.pdf")
# Slight variation of the dataset leads to a completely different tree
y = np.zeros(len(X), dtype=int)
y[(2 <= X[:, 0]) & (X[:, 0] <= 5) & (2 <= X[:, 1]) & (X[:, 1] <= 5)] = 1
special_condition = (X[:, 0] == 3) & (X[:, 1] == 3) | (X[:, 0] == 4) & (X[:, 1] == 3)
y[special_condition] = 0

plt.scatter(X[:, 0], X[:, 1], c=y)
format_axes(plt.gca())
plt.savefig("../figures/decision-trees/bias-variance-dataset-2-2.pdf")

create_DT_Classifier_with_graph(X, y, None, "bias-variance-full-depth-2")