import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier
from IPython.display import Image
# To plot trees in forest via graphviz
from sklearn.tree import export_graphviz
import graphviz
try:
    from latexify import latexify, format_axes
    latexify(columns=2)
except:
    pass
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Random Forest Feature Importance
# Load IRIS dataset from Seaborn
iris = sns.load_dataset('iris')
iris
|     | sepal_length | sepal_width | petal_length | petal_width | species   |
| --- | --- | --- | --- | --- | --- |
| 0   | 5.1 | 3.5 | 1.4 | 0.2 | setosa    |
| 1   | 4.9 | 3.0 | 1.4 | 0.2 | setosa    |
| 2   | 4.7 | 3.2 | 1.3 | 0.2 | setosa    |
| 3   | 4.6 | 3.1 | 1.5 | 0.2 | setosa    |
| 4   | 5.0 | 3.6 | 1.4 | 0.2 | setosa    |
| ... | ... | ... | ... | ... | ...       |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |

150 rows × 5 columns
# classes
iris.species.unique()
array(['setosa', 'versicolor', 'virginica'], dtype=object)
plt.hist(iris.sepal_width, bins=20)
(array([ 1., 3., 4., 3., 8., 14., 14., 10., 26., 11., 19., 12., 6.,
4., 9., 2., 1., 1., 1., 1.]),
array([2. , 2.12, 2.24, 2.36, 2.48, 2.6 , 2.72, 2.84, 2.96, 3.08, 3.2 ,
3.32, 3.44, 3.56, 3.68, 3.8 , 3.92, 4.04, 4.16, 4.28, 4.4 ]),
<BarContainer object of 20 artists>)
=iris, x="sepal_length") sns.kdeplot(data
='kde') sns.displot(iris.sepal_length.values, kind
=iris, x="sepal_length", kind='kde') sns.displot(data
"species")["petal_length"].mean() iris.groupby(
species
setosa 1.462
versicolor 4.260
virginica 5.552
Name: petal_length, dtype: float64
# Pairplot
="species") sns.pairplot(iris, hue
# Divide dataset into X and y
X, y = iris.iloc[:, :-1], iris.iloc[:, -1]
rf = RandomForestClassifier(n_estimators=10, random_state=0, criterion='entropy', bootstrap=True)
rf.fit(X, y)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
# Visualize each tree in the Random Forest
for i, tree in enumerate(rf.estimators_):
    # Create DOT data for the i-th tree
    dot_data = export_graphviz(tree, out_file=None,
                               feature_names=iris.columns[:-1],
                               class_names=iris.species.unique(),
                               filled=True, rounded=True,
                               special_characters=True,
                               impurity=True,
                               node_ids=True)

    # Use Graphviz to render the DOT data into a graph
    graph = graphviz.Source(dot_data)

    # Save or display the graph (change the format as needed)
    graph.render(filename=f'../figures/ensemble/feature-imp-{i}', format='pdf', cleanup=True)
    graph.render(filename=f'../figures/ensemble/feature-imp-{i}', format='png', cleanup=True)
# Visualize the tree
Image(filename='../figures/ensemble/feature-imp-0.png')
rf.feature_importances_
array([0.09864748, 0.03396026, 0.32312193, 0.54427033])
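The importances above follow the column order of `X`, so pairing them with the feature names makes the array easier to read (a small convenience snippet, same idea as the labelled series used further below):

pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)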
Notation:
- \(t\) = node
- \(N_t\) = number of observations at node \(t\)
- \(N_{t_L}\) = number of observations in the left child node of node \(t\)
- \(N_{t_R}\) = number of observations in the right child node of node \(t\)
- \(p(t)=N_t/N\) = proportion of observations in node \(t\)
- \(X_j\) = feature \(j\)
- \(j_t\) = feature used at node \(t\) for splitting
- \(i(t)\) = impurity at node \(t\) (impurity = entropy in this case)
- \(M\) = number of trees in the forest
For a particular node:
- Information gain at node \(t\) = Impurity reduction at node \(t\) = entropy(parent) - weighted entropy(children)
\(\Delta i(t) = i(t) - \frac{N_{t_L}}{N_t} i(t_L) - \frac{N_{t_R}}{N_t} i(t_R)\)
For a tree:
Importance of feature \(X_j\) is given by:
\(\text{Imp}(X_j) = \sum_{t \in \varphi_{m}} 1(j_t = j) \Big[ p(t) \Delta i(t) \Big]\)
For a forest:
Importance of feature \(X_j\) for an ensemble of \(M\) trees \(\varphi_{m}\) is:
\[\begin{equation*} \text{Imp}(X_j) = \frac{1}{M} \sum_{m=1}^M \sum_{t \in \varphi_{m}} 1(j_t = j) \Big[ p(t) \Delta i(t) \Big] \end{equation*}\]
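Before recomputing this by hand node by node (done below), here is a minimal sketch of \(p(t)\,\Delta i(t)\) for the root node of the first tree, using the arrays exposed by its `tree_` attribute (it assumes `rf` has been fitted as above):

t = rf.estimators_[0].tree_
left, right = t.children_left[0], t.children_right[0]
# weights of the two children relative to the root node
w_l = t.n_node_samples[left] / t.n_node_samples[0]
w_r = t.n_node_samples[right] / t.n_node_samples[0]
# impurity reduction at the root: i(t) minus the weighted impurity of the children
delta_i_root = t.impurity[0] - w_l * t.impurity[left] - w_r * t.impurity[right]
# p(root) = 1, so this is also the root's contribution to the importance of the feature split there
delta_i_root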
# fraction of distinct samples expected in a bootstrap sample (the remaining ~1/e are "out of bag")
1 - 1/np.e
0.6321205588285577
N = 150
# expected number of distinct training samples seen by each tree
(1 - 1/np.e)*N
94.81808382428365
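A quick simulation sketch (assuming bootstrap samples of size `N`, as above) recovers roughly the same fraction of distinct samples:

rng = np.random.default_rng(0)
# draw 1000 bootstrap samples of size N and measure the fraction of distinct indices in each
fractions = [len(np.unique(rng.integers(0, N, size=N))) / N for _ in range(1000)]
np.mean(fractions)  # close to 1 - 1/e ≈ 0.632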
rf.feature_importances_
array([0.09864748, 0.03396026, 0.32312193, 0.54427033])
list_of_trees_in_rf = rf.estimators_
tree_0 = list_of_trees_in_rf[0]
tree_0.feature_importances_
array([0.00397339, 0.01375245, 0.35802357, 0.6242506 ])
# Average the per-tree importances; this reproduces rf.feature_importances_
s = []
for tree in rf.estimators_:
    s.append(tree.feature_importances_)
np.array(s).mean(axis=0)
array([0.09864748, 0.03396026, 0.32312193, 0.54427033])
rf.feature_importances_
array([0.09864748, 0.03396026, 0.32312193, 0.54427033])
tree_0 = rf.estimators_[0]
tree_0.feature_importances_
array([0.00397339, 0.01375245, 0.35802357, 0.6242506 ])
# take one tree
tree = rf.estimators_[0].tree_
tree.feature
array([ 3, -2, 2, 3, -2, 1, -2, -2, 0, -2, 2, -2, -2], dtype=int64)
# Create a mapping from feature indices to feature names (-2 marks a leaf node)
mapping = {-2: 'Leaf', 0: 'sepal_length', 1: 'sepal_width', 2: 'petal_length', 3: 'petal_width'}

# print the node number along with the corresponding feature name
for node in range(tree.node_count):
    print(f'Node {node}: {mapping[tree.feature[node]]}')
Node 0: petal_width
Node 1: Leaf
Node 2: petal_length
Node 3: petal_width
Node 4: Leaf
Node 5: sepal_width
Node 6: Leaf
Node 7: Leaf
Node 8: sepal_length
Node 9: Leaf
Node 10: petal_length
Node 11: Leaf
Node 12: Leaf
id = 2
tree.children_left[id], tree.children_right[id]
(3, 8)
def print_child_id(tree, node):
    '''
    Returns the ids of the child nodes of a given node
    (None, None if the node is a leaf).
    tree: tree object
    node: int
    '''
    # check if leaf
    l, r = tree.children_left[node], tree.children_right[node]
    if l == -1 and r == -1:
        return None, None
    return l, r

print_child_id(tree, 0)
(1, 2)
tree.impurity
array([1.57310798, 0. , 0.98464683, 0.34781691, 0. ,
0.81127812, 0. , 0. , 0.12741851, 0. ,
0.2108423 , 0. , 0. ])
def all_data(tree, node):
    '''
    Returns all the data required to calculate the information gain at a node.
    '''
    # get the child nodes
    left, right = print_child_id(tree, node)

    # check if leaf, then return None
    if left is None:
        return None

    # get the impurities
    entropy_node = tree.impurity[node]
    entropy_left = tree.impurity[left]
    entropy_right = tree.impurity[right]

    # N = total number of samples considered during bagging,
    # therefore it equals the number of samples at the root node
    N = tree.n_node_samples[0]

    # n_l = number of samples at the left child node
    n_l = tree.n_node_samples[left]

    # n_r = number of samples at the right child node
    n_r = tree.n_node_samples[right]

    # n_t = total number of samples at the node
    n_t = n_l + n_r

    # feature used to split the node
    feature = mapping[tree.feature[node]]

    # calculate the information gain
    info_gain_t = entropy_node - (n_l/n_t * entropy_left + n_r/n_t * entropy_right)

    return info_gain_t, N, n_l, n_r, n_t, feature
# Calculate the importance of each feature using the information gain for a tree
scores = {}
for node in range(tree.node_count):
    # Add the information gain of the node to the dictionary if it is not a leaf node
    try:
        ig, N, n_l, n_r, n_t, feature = all_data(tree, node)
        p_t = n_t / N
        scores[feature] = scores.get(feature, 0) + p_t * ig
    # Skip if it is a leaf node (all_data returned None)
    except:
        continue

ser = pd.Series(scores)
info_gain_tree = ser / ser.sum()
info_gain_tree.sort_values(ascending=False)
petal_width 0.639307
petal_length 0.340335
sepal_width 0.016459
sepal_length 0.003899
dtype: float64
# Feature importance using sklearn for a tree
sklearn_imp = tree.compute_feature_importances()
pd.Series(sklearn_imp, index=iris.columns[:-1]).sort_values(ascending=False)
petal_width 0.624251
petal_length 0.358024
sepal_width 0.013752
sepal_length 0.003973
dtype: float64
# Feature importance using sklearn for the forest
sklearn_imp_forest = np.array([x.tree_.compute_feature_importances() for x in rf.estimators_]).mean(axis=0)
pd.Series(sklearn_imp_forest, index=iris.columns[:-1]).sort_values(ascending=False)
petal_width 0.544270
petal_length 0.323122
sepal_length 0.098647
sepal_width 0.033960
dtype: float64
ser = pd.Series(sklearn_imp_forest, index=iris.columns[:-1])
ser.plot(kind='bar', rot=0)
format_axes(plt.gca())
plt.savefig('../figures/ensemble/feature-imp-forest.pdf', bbox_inches='tight')
Aside:
tree.tree_.feature
returns the feature used at each node to split it into two child nodes, using the mapping given below. The feature indices follow the column order of the input data.
- -2: leaf node
- 0: sepal_length
- 1: sepal_width
- 2: petal_length
- 3: petal_width
tree.tree_.children_left[node]
returns the node number of the left child of the node
tree.tree_.children_right[node]
returns the node number of the right child of the node
If there is no left or right child (i.e. the node is a leaf), it returns -1.
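Putting these arrays together, a small recursive helper (illustrative only, using the `tree` and `mapping` objects defined earlier) prints the structure of a tree:

def print_tree(t, node=0, depth=0):
    '''Recursively print the split feature (or "Leaf") at each node.'''
    print('  ' * depth + f'Node {node}: {mapping[t.feature[node]]}')
    if t.children_left[node] != -1:  # -1 means no child, i.e. the node is a leaf
        print_tree(t, t.children_left[node], depth + 1)
        print_tree(t, t.children_right[node], depth + 1)

print_tree(tree)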
Bootstrap code: the bootstrap resampling of the training data happens inside the random forest's `fit()` function.
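Conceptually, each tree in the forest is fit on a bootstrap resample of the training data. A minimal sketch of that resampling step (illustrative only, not scikit-learn's actual implementation) could look like this:

def bootstrap_sample(X, y, rng):
    '''Draw len(X) rows with replacement from (X, y).'''
    idx = rng.integers(0, len(X), size=len(X))
    return X.iloc[idx], y.iloc[idx]

rng = np.random.default_rng(0)
X_boot, y_boot = bootstrap_sample(X, y, rng)
X_boot.index.nunique()  # roughly (1 - 1/e) * 150 distinct rows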