import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
from ipywidgets import interact
from sklearn import datasets, preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import make_pipeline
from util import plot_dendrogram
%matplotlib widget
In this notebook, we continue to cluster the instances in the iris 2D dataset.
Agglomerative clustering
We first import the iris dataset from sklearn.datasets and store it as a DataFrame.
# load the dataset from sklearn
dataset = datasets.load_iris()
# create a DataFrame to help further analysis
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df["target"] = dataset.target
df.target = df.target.astype("category")
df.target = df.target.cat.rename_categories(dataset.target_names)
df # display an overview of the data
To normalize the features and then apply agglomerative clustering, we create a pipeline as follows:
from sklearn.cluster import AgglomerativeClustering
agnes_minmax_normalized = make_pipeline(
preprocessing.MinMaxScaler(),
AgglomerativeClustering(
n_clusters=3, linkage="complete", memory="private", compute_distances=True
),
)
agnes_minmax_normalized
In the above, we configured the agglomerative clustering algorithm to use complete linkage and to return 3 clusters.
- By setting the parameter memory, the fitted solution is cached to the specified folder private (an equivalent setup with an explicit cache object is sketched after this list).
- By setting compute_distances to True, the cluster distances are computed for the dendrogram visualization later on.
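The following is a minimal sketch of an equivalent configuration that passes an explicit joblib.Memory object (which the memory parameter also accepts) instead of a folder name; the variable names cache and agnes_cached and the commented-out clean-up call are illustrative additions, not part of the notebook:
from joblib import Memory

# hypothetical equivalent of memory="private": pass an explicit joblib cache object
cache = Memory(location="private", verbose=0)
agnes_cached = make_pipeline(
    preprocessing.MinMaxScaler(),
    AgglomerativeClustering(
        n_clusters=3, linkage="complete", memory=cache, compute_distances=True
    ),
)
# cache.clear()  # remove the cached tree computations when no longer needed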
To cluster based on the two input features petal length (cm) and petal width (cm):
feature1, feature2 = "petal length (cm)", "petal width (cm)"
cluster_labels = agnes_minmax_normalized.fit_predict(df[[feature1, feature2]])
plt.figure(num=1, figsize=(10, 5))
plt.subplot(121)
plt.scatter(df[feature1], df[feature2], c=cluster_labels)
plt.title("Cluster assignment")
plt.xlabel(feature1)
plt.ylabel(feature2)
plt.subplot(122)
plt.scatter(df[feature1], df[feature2], c=dataset["target"])
plt.title("Class (ground truth)")
plt.xlabel(feature1)
plt.show()
The fit_predict method fits and returns the cluster labels for the given data.
YOUR ANSWER HERE
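As a side illustration of the above (a sketch, not part of the original exercise), fit_predict on the pipeline is equivalent to fitting it and then reading labels_ from its last step; the adjusted Rand index used below is our own choice of comparison metric, not one prescribed by the notebook:
from sklearn.metrics import adjusted_rand_score

# fit_predict above is equivalent to fitting the pipeline and reading labels_ of its last step
agnes_minmax_normalized.fit(df[[feature1, feature2]])
assert (agnes_minmax_normalized[-1].labels_ == cluster_labels).all()

# adjusted Rand index: 1.0 means perfect agreement with the ground-truth classes
print("Adjusted Rand index:", round(adjusted_rand_score(dataset.target, cluster_labels), 3))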
Dendrogram
Further details of a clustering solution can be obtained from the fitted attributes:
agnes = agnes_minmax_normalized["agglomerativeclustering"]
print("Cluster labels for each sample:", *agnes.labels_)
print("Children of each non-leaf node:", *agnes.children_)
print("Distances between nodes:", *agnes.distances_.round(3))
print("Number of leaves:", agnes.n_leaves_)
print("Number of connected components:", agnes.n_connected_components_)To plot the dendrogram, we will use the function plot_dendrogram input as follows:
from util import plot_dendrogram
The code is adapted from an example in sklearn, which uses dendrogram from scipy.cluster.hierarchy.
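A minimal sketch of what such a helper might look like, following the sklearn documentation example, is given below; it is an assumption about util.plot_dendrogram (the actual implementation may differ), and the name sketch_plot_dendrogram is hypothetical. It assembles a SciPy linkage matrix from children_, distances_, and the number of samples merged under each node:
import numpy as np
from scipy.cluster.hierarchy import dendrogram


def sketch_plot_dendrogram(model, **kwargs):
    # count the samples merged under each non-leaf node
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        count = 0
        for child in merge:
            if child < n_samples:
                count += 1  # a leaf, i.e., an original sample
            else:
                count += counts[child - n_samples]  # a previously merged node
        counts[i] = count
    # each row: [child 1, child 2, cophenetic distance, samples in the merged node]
    linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)
    dendrogram(linkage_matrix, **kwargs)
To generate the dendrogram with the imported helper: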
plt.figure(num=2, figsize=(10, 5))
plot_dendrogram(agnes, distance_sort=True)
plt.title("Dendrogram for complete-linkage method on iris dataset")
plt.ylabel("cophenetic distance")
plt.xlabel("instance index")
plt.show()
To explore how the choice of linkage, the pair of features, and the number of clusters k affect the solution, complete the following interactive analysis:
@interact(
linkage=["ward", "complete", "average", "single"],
feature1=dataset.feature_names,
feature2=dataset.feature_names,
)
def analyze_agnes(
linkage,
feature1=dataset.feature_names[2],
feature2=dataset.feature_names[3],
k=widgets.IntSlider(3, 1, 4, continuous_update=False),
):
# YOUR CODE HERE
raise NotImplementedError()
agnes = agnes_minmax_normalized["agglomerativeclustering"]
plt.figure(num=3, figsize=(10, 10))
plt.subplot(211)
plot_dendrogram(agnes)
plt.title(f"Dendrogram for {linkage}-linkage method on iris dataset")
plt.ylabel("cophenetic distance")
plt.xlabel("instance index")
plt.subplot(223)
plt.scatter(df[feature1], df[feature2], c=agnes.labels_)
plt.title("Cluster assignment")
plt.ylabel(feature2)
plt.xlabel(feature1)
plt.subplot(224)
plt.scatter(df[feature1], df[feature2], c=dataset["target"])
plt.title("Class (ground truth)")
plt.xlabel(feature1)
plt.show()