import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, tree
%matplotlib widget
plt.close("all")Decision Tree Induction¶
We will use the iris dataset from the sklearn.datasets package.
iris = datasets.load_iris()
Recall that the classification task is to train a model that can automatically classify the species (target) based on the lengths and widths of the petals and sepals (input features).
To build a decision tree, we use DecisionTreeClassifier from sklearn.tree and apply its fit method on the training set.
clf_gini = tree.DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)
To display the decision tree, we can use the function plot_tree from sklearn.tree:
plt.figure()
tree.plot_tree(clf_gini)
plt.show()
To make the decision tree look better, we can provide additional options:
options = {
"feature_names": iris.feature_names,
"class_names": iris.target_names,
"label": "root",
"filled": True,
"node_ids": True,
"proportion": True,
"rounded": True,
"fontsize": 7,
} # store options as dict for reuse
plt.figure(figsize=(9, 6))
tree.plot_tree(clf_gini, **options) # ** unpacks dict as keyword arguments
plt.show()
In particular, the iris setosa is distinguished immediately after checking the petal width/length.
# YOUR CODE HERE
raise NotImplementedError()
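For reference only, a hedged sketch of one possible way to obtain clf_entropy, which the plotting cell below assumes; it simply reuses the settings of clf_gini but with the entropy criterion of DecisionTreeClassifier:
clf_entropy = tree.DecisionTreeClassifier(criterion="entropy", random_state=0).fit(
    iris.data, iris.target
)  # sketch: same settings as clf_gini but with entropy as the impurity measure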
plt.figure(figsize=(9, 6))
tree.plot_tree(clf_entropy, **options)
plt.show()
YOUR ANSWER HERE
Splitting Criterion¶
To induce a good decision tree efficiently, the splitting criterion is chosen
- greedily to maximize the reduction in impurity and
- recursively starting from the root.
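The rest of this notebook builds up these two ideas step by step. As a rough illustration only (not sklearn's algorithm; the helper names best_split and induce are ad hoc, and the sketch reuses np and iris from the cells above), the following induces a tiny tree greedily and recursively on the iris data:
def gini(labels):
    """Gini impurity of a 1D array of class labels (sketch helper)."""
    p = np.unique(labels, return_counts=True)[1] / len(labels)
    return 1 - (p**2).sum()


def best_split(X, y):
    """Greedy step (sketch): return (feature index, split point, drop in impurity)
    of the best binary split X[:, j] <= s over all features and mid-points."""
    best = (None, None, 0.0)
    for j in range(X.shape[1]):
        values = np.sort(np.unique(X[:, j]))
        for s in (values[1:] + values[:-1]) / 2:
            mask = X[:, j] <= s
            q = mask.mean()
            drop = gini(y) - q * gini(y[mask]) - (1 - q) * gini(y[~mask])
            if drop > best[2]:
                best = (j, s, drop)
    return best


def induce(X, y, depth=0, max_depth=2):
    """Recursive step (sketch): print the induced tree, splitting greedily from
    the root until a node is pure or max_depth is reached."""
    j, s, drop = best_split(X, y)
    if j is None or depth >= max_depth:
        print("  " * depth + f"predict {iris.target_names[np.bincount(y).argmax()]}")
        return
    print("  " * depth + f"{iris.feature_names[j]} <= {s:.2f}? (drop in impurity {drop:.3f})")
    induce(X[X[:, j] <= s], y[X[:, j] <= s], depth + 1, max_depth)
    induce(X[X[:, j] > s], y[X[:, j] > s], depth + 1, max_depth)


induce(iris.data, iris.target)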
Overview using pandas¶
To get a rough idea of which features are good to split on, we will use a pandas DataFrame
to operate on the iris dataset.
# write the input features first
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# append the target values to the last column
df["target"] = iris.target
df.target = df.target.astype("category").cat.rename_categories(
dict(zip(range(3), iris.target_names))
)
df
To display some statistics of the input features for different classes:
df.groupby("target", observed=False).boxplot(rot=90, layout=(1, 3), figsize=(7, 9))
df.groupby("target", observed=False).agg(["mean", "std"]).round(2)YOUR ANSWER HERE
Measuring impurity¶
Suppose nearly all instances of a dataset belong to the same class. In that case, we can return the majority class as the decision without further splitting. A measure of impurity is the Gini impurity index, defined as follows:
$$\operatorname{Gini}(D) := g(p_Y) = 1 - \sum_k p_Y(k)^2,$$
where $p_Y$ is the empirical distribution of the target $Y$ over the classes $k$ in the dataset $D$.
We can represent a distribution simply as a numpy array. To return the empirical class distribution of the iris dataset:
def dist(values):
"""Returns the empirical distribution of the given 1D array of values as a
1D array of probabilities. (The ordering is immaterial.)"""
counts = np.unique(values, return_counts=True)[-1]
return counts / counts.sum()
print(f"Distribution of target: {dist(iris.target).round(3)}")The Gini impurity index can be implemented as follows:
def g(p):
"""Returns the Gini impurity of the distribution p."""
return 1 - (p**2).sum()
print(f"Gini(D) = {g(dist(iris.target)):.3g}")Another measure of impurity uses the information measure called entropy in information theory:
def h(p):
"""Returns the entropy of distribution p (1D array)."""
p = np.array(p)
p = p[(p > 0) & (p < 1)] # 0 log 0 = 1 log 1 = 0
# YOUR CODE HERE
raise NotImplementedError()
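For reference only, a hedged sketch of one possible completion (not necessarily the intended solution) that is consistent with the tests below; it uses the base-2 logarithm so that entropy is measured in bits:
def h(p):
    """Sketch (for reference only): entropy of distribution p in bits."""
    p = np.array(p)
    p = p[(p > 0) & (p < 1)]  # terms with probability 0 or 1 contribute nothing
    return -(p * np.log2(p)).sum()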
print(f"Info(D): {h(dist(iris.target)):.3g}")# tests
assert np.isclose(h([1 / 2, 1 / 2]), 1)
assert np.isclose(h([1, 0]), 0)
assert np.isclose(h([1 / 2, 1 / 4, 1 / 4]), 1.5)
# hidden tests
Drop in impurity¶
We will consider the binary splitting criterion $X \le s$ in particular, which gives
$$\Delta\operatorname{Gini}_{X \le s}(Y) := g(p_Y) - \hat{P}(X \le s)\, g(p_{Y \mid X \le s}) - \hat{P}(X > s)\, g(p_{Y \mid X > s}),$$
where
- $Y$ denotes the target,
- $\hat{P}$ denotes the empirical distribution, and
- $p_Y$, $p_{Y \mid X \le s}$, and $p_{Y \mid X > s}$ denote the empirical probability mass functions of $Y$ with or without conditioning on the splitting criterion.
def drop_in_gini(X, Y, s):
"""Returns the drop in Gini impurity of the target Y
for the binary splitting criterion X <= s.
Parameters
----------
X: 1D array
Input feature values for different instances.
Y: 1D array
Target values corresponding to X.
s: Splitting point for X.
"""
S = X <= s
q = S.mean()
return g(dist(Y)) - q * g(dist(Y[S])) - (1 - q) * g(dist(Y[~S]))
X, Y = df["petal width (cm)"], df.target
print(f"Drop in Gini: {drop_in_gini(X, Y, 0.8):.4g}")To compute the best splitting point for a given input feature, we check every consecutive mid-points of the observed feature values:
def find_best_split_pt(X, Y, gain_function):
"""Return the best splitting points and the maximum gain evaluated using
gain_function for the split X <= s and target Y.
Parameters
----------
X: 1D array
Input feature values for different instances.
Y: 1D array
Target values corresponding to X.
gain_function: function of (X, Y, s)
A function such as drop_in_gini for evaluating a
splitting criterion X <= s.
Returns
-------
tuple: (s, g) where s is the best splitting point and g is the maximum gain.
See also
--------
drop_in_gini
"""
values = np.sort(np.unique(X))
split_pts = (values[1:] + values[:-1]) / 2
gain = np.array([gain_function(X, Y, s) for s in split_pts])
i = np.argmax(gain)
return split_pts[i], gain[i]
print(
"""Best split point: {0:.3g}
Maximum gain: {1:.3g}""".format(
*find_best_split_pt(X, Y, drop_in_gini)
)
)
The following ranks the features according to the gains of their best binary splits:
rank_by_gini = pd.DataFrame(
{
"feature": feature,
**(lambda s, g: {"split point": s, "gain": g})(
*find_best_split_pt(df[feature], df.target, drop_in_gini)
),
}
for feature in iris.feature_names
).sort_values(by="gain", ascending=False)
rank_by_gini
Using the entropy to measure impurity, we have the following alternative gain function, known as the information gain. We will again consider the binary splitting criterion $X \le s$ in particular, which gives
$$\operatorname{Gain}_{X \le s}(Y) := h(p_Y) - \hat{P}(X \le s)\, h(p_{Y \mid X \le s}) - \hat{P}(X > s)\, h(p_{Y \mid X > s}),$$
with $h$, $p_Y$, $p_{Y \mid X \le s}$, and $p_{Y \mid X > s}$ as defined before.
def gain(X, Y, s):
"""Returns the information Gain of Y for the split X <= s.
Parameters
----------
X: 1D array
Input feature values for different instances.
Y: 1D array
Target values corresponding to X.
s: Splitting point for X.
"""
S = X <= s
q = S.mean()
# YOUR CODE HERE
raise NotImplementedError()
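For reference only, a hedged sketch of one possible completion (not necessarily the intended solution), mirroring drop_in_gini with the entropy h in place of the Gini impurity g:
def gain(X, Y, s):
    """Sketch (for reference only): information gain of Y for the binary split X <= s."""
    S = X <= s
    q = S.mean()
    # entropy of Y minus the weighted entropies of Y on either side of the split
    return h(dist(Y)) - q * h(dist(Y[S])) - (1 - q) * h(dist(Y[~S]))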
print(f"Information gain: {gain(X, Y, 0.8):.4g}")# tests
rank_by_entropy = pd.DataFrame(
{
"feature": feature,
**(lambda s, g: {"split point": s, "gain": g})(
*find_best_split_pt(df[feature], df.target, gain)
),
}
for feature in iris.feature_names
).sort_values(by="gain", ascending=False)
rank_by_entropy
# hidden tests
The C4.5 induction algorithm uses the information gain ratio instead of the information gain:
$$\operatorname{GainRatio}_{X \le s}(Y) := \frac{\operatorname{Gain}_{X \le s}(Y)}{\operatorname{SplitInfo}_{X \le s}} \qquad\text{where}\qquad \operatorname{SplitInfo}_{X \le s} := h\big(\big[\hat{P}(X \le s),\, \hat{P}(X > s)\big]\big),$$
i.e., the information gain normalized by the entropy of the split itself.
def gain_ratio(X, Y, split_pt):
    """Returns the information gain ratio of Y for the split X <= split_pt."""
    S = X <= split_pt
    q = S.mean()
    # YOUR CODE HERE
    raise NotImplementedError()
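For reference only, a hedged sketch of one possible completion (not necessarily the intended solution), normalizing the information gain by the split information:
def gain_ratio(X, Y, split_pt):
    """Sketch (for reference only): information gain ratio of Y for the split X <= split_pt."""
    S = X <= split_pt
    q = S.mean()
    # split information: entropy of the binary split itself
    split_info = h(np.array([q, 1 - q]))
    return gain(X, Y, split_pt) / split_info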
# tests
rank_by_gain_ratio = pd.DataFrame(
{
"feature": feature,
**(lambda s, g: {"split point": s, "gain": g})(
*find_best_split_pt(df[feature], df.target, gain_ratio)
),
}
for feature in iris.feature_names
).sort_values(by="gain", ascending=False)
rank_by_gain_ratio
# hidden tests
YOUR ANSWER HERE