
Evaluation of Clusters

City University of Hong Kong
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline

%matplotlib widget

In this notebook, we will consider different methods of evaluating clustering solutions.

Intrinsic measures

The intrinsic measures of cluster quality are helpful when the ground truth is unavailable or should not be used. For example, to determine the number of clusters, we can compare the intrinsic measures of the clustering solutions for different numbers of clusters.

Elbow Method

The following diagram shows the KnowledgeFlow interface of Weka that applies SimpleKMeans to the iris.2D.arff dataset for different choices of $k$.

Elbow method

Implement the above using Weka.

df_WSS = pd.DataFrame(columns=["k", "WSS"])
df_WSS["k"] = np.arange(1, 5, dtype=int)
df_WSS["WSS"] = df_WSS["WSS"].astype(float)
# YOUR CODE HERE
raise NotImplementedError()

# plot WSS as a function of k
fig, ax = plt.subplots(clear=True, figsize=(10, 10), layout="constrained", num=1)
df_WSS.plot(x="k", y="WSS", xlabel=r"$k$", ylabel="WSS", legend=False, ax=ax)
# plt.xlabel("k")
# plt.ylabel("WSS")
plt.show()
df_WSS
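For reference, the same WSS curve can be sketched directly in Python. Below is a minimal sketch (not necessarily the intended solution), assuming the two petal features of sklearn's iris dataset as a stand-in for iris.2D.arff; after fitting, KMeans exposes the within-cluster sum of squares as its inertia_ attribute. The values may differ from Weka's depending on preprocessing and initialization.

# A hedged sketch: WSS via KMeans' inertia_ attribute, using the two petal
# features of sklearn's iris dataset as a stand-in for iris.2D.arff.
X_2D = datasets.load_iris().data[:, 2:]  # petal length and petal width
for k in range(1, 5):
    wss = KMeans(n_clusters=k, random_state=0).fit(X_2D).inertia_
    print(f"k={k}: WSS={wss:.3f}")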

Silhouette analysis

An alternative to the elbow method is to compute the silhouette coefficient below. For each sample $\M{p}$, let $a(\M{p})$ be the mean distance from $\M{p}$ to the other samples in its own cluster, and $b(\M{p})$ be the mean distance from $\M{p}$ to the samples in the nearest other cluster; then

\begin{align} s(\M{p}) := \frac{b(\M{p})-a(\M{p})}{\max\{a(\M{p}), b(\M{p})\}} \in [-1, 1], \end{align}

where a larger value indicates a better clustering of $\M{p}$.

We will use the existing implementation in sklearn. First, import the iris dataset from sklearn.datasets and store it as a DataFrame:

# load the dataset from sklearn
dataset = datasets.load_iris()

# create a DataFrame to help further analysis
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df["target"] = dataset.target
df.target = df.target.astype("category")
df.target = df.target.cat.rename_categories(dataset.target_names)
df

To cluster the data using $k$-means clustering:

kmeans_minmax_normalized = make_pipeline(
    preprocessing.MinMaxScaler(), KMeans(n_clusters=3, random_state=0)
)
kmeans_minmax_normalized

feature1, feature2 = "petal length (cm)", "petal width (cm)"
labels = kmeans_minmax_normalized.fit_predict(df[[feature1, feature2]])

plt.figure(figsize=(10, 5), clear=True, num=2)

_ = plt.subplot(121, title="Cluster assignment", xlabel=feature1, ylabel=feature2)
plt.scatter(df[feature1], df[feature2], c=labels)

plt.subplot(122, title="Class (ground truth)", xlabel=feature1, sharey=_)
plt.scatter(df[feature1], df[feature2], c=dataset["target"])

plt.show()
df_silhouette = pd.DataFrame(columns=["k", "s"])
df_silhouette["k"] = np.arange(2, 11, dtype=int)
df_silhouette["s"] = df_silhouette["s"].astype(float)
for i in range(len(df_silhouette)):
    labels_ = make_pipeline(
        preprocessing.MinMaxScaler(),
        KMeans(n_clusters=df_silhouette.loc[i, "k"], random_state=0),
    ).fit_predict(df[[feature1, feature2]])
    # YOUR CODE HERE
    raise NotImplementedError()

# plot the silhouette score as a function of k
fig, ax = plt.subplots(clear=True, figsize=(10, 10), layout="constrained", num=4)
df_silhouette.plot(
    x="k", y="s", xlabel=r"$k$", ylabel="silhouette score", legend=False, ax=ax
)
plt.show()
df_silhouette
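As a hint for the loop body above, the sketch below (one possibility, not necessarily the intended solution) scores a single clustering with silhouette_score from sklearn.metrics; the exercise would store such a score in df_silhouette for each k.

# A hedged sketch: silhouette score for one clustering (k = 3).
X = df[[feature1, feature2]]
labels_3 = make_pipeline(
    preprocessing.MinMaxScaler(), KMeans(n_clusters=3, random_state=0)
).fit_predict(X)
# Whether to score the scaled or the original features is a design choice;
# here the original features are used.
print(f"silhouette score for k=3: {silhouette_score(X, labels_3):.3g}")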

Extrinsic measures of cluster quality

Suppose $L(\M{p})$ and $C(\M{p})$ are the class and cluster labels of each sample $\M{p}\in D$. An extrinsic measure compares how similar the corresponding partitions $\Set{L_i}$ and $\Set{C_i}$ are, where

\begin{align} L_i &:= \Set{\M{p}|L(\M{p})=i}\\ C_i &:= \Set{\M{p}|C(\M{p})=i}. \end{align}

Pairwise correctness

Define the indicator of correct clustering for a pair $\M{p}, \M{q}\in D$ as

\begin{align} \op{correctness}(\M{p},\M{q}) := \begin{cases} 1 & L(\M{p})=L(\M{q}) \iff C(\M{p})=C(\M{q})\\ 0 & \text{otherwise.} \end{cases} \end{align}

In other words, the clustering for the pair of samples is correct when

  • the samples have equal class labels and equal cluster labels, or
  • the samples have different class labels and different cluster labels.

The following function returns a boolean matrix of $[\op{correctness}(\M{p},\M{q})]$, with rows and columns indexed by $\M{p}$ and $\M{q}$, respectively.

def correctness(class_labels, cluster_labels):
    """Returns the pairwise correctness matrix for the class and cluster labels.

    Parameters:
    -----------
    class_labels (array): non-negative integer class labels for certain samples
    cluster_labels (array): corresponding non-negative integer cluster labels

    Returns:
    --------
    A matrix (2D array) of correctness(p,q) with rows and columns indexed by
    p and q, respectively, in the samples. correctness(p,q) indicates whether
      - p and q have equal class labels and equal cluster labels, or
      - p and q have different class labels and different cluster labels.
    """
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)

    eq_class = class_labels.reshape(-1, 1) == class_labels.reshape(1, -1)
    eq_cluster = cluster_labels.reshape(-1, 1) == cluster_labels.reshape(1, -1)

    return (eq_class & eq_cluster) | ~(eq_class | eq_cluster)

For instance, consider the following class and cluster labels:

| index $i$ | class label $L(\M{p}_i)$ | cluster label $C(\M{p}_i)$ |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 1 | 0 |

The correctness matrix and the fraction of correctly clustered pairs are:

correctness_matrix = correctness(class_labels=[0, 0, 1], cluster_labels=[1, 1, 0])
correctness_accuracy = correctness_matrix.mean()
print(f"Correctness matrix:\n {correctness_matrix}")
print(f"Accuracy: {correctness_matrix.mean():.3g}")
# YOUR CODE HERE
raise NotImplementedError()
correctness_accuracy
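A plausible completion of the exercise above (an assumption about the intended task): evaluate the pairwise correctness of the earlier k-means clustering of the iris dataset against the ground-truth classes.

# A hedged sketch: pairwise correctness accuracy for the iris clustering,
# comparing the ground-truth classes with the k-means labels computed above.
correctness_accuracy = correctness(dataset["target"], labels).mean()
correctness_accuracy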

B-Cubed metrics

The accuracy computed from the pairwise correctness matrix can be misleading as it is dominated by true negatives. Similar to the class imbalance problem, precision and recall can be used instead.
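The following sketch illustrates the issue: with 10 classes of 2 samples each, even the useless clustering that isolates every sample in its own cluster scores a pairwise accuracy of 0.95, since almost all pairs are true negatives.

# A hedged sketch: pairwise accuracy rewards a degenerate clustering.
many_classes = np.repeat(np.arange(10), 2)  # 10 classes with 2 samples each
singleton_clusters = np.arange(20)  # every sample in its own cluster
print(correctness(many_classes, singleton_clusters).mean())  # 0.95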

For instance, consider the following class and cluster labels:

| index $i$ | class label $L(\M{p}_i)$ | cluster label $C(\M{p}_i)$ |
|---|---|---|
| 0 | 0 | 0 |
| 1 | 0 | 1 |
| 2 | 1 | 2 |
| 3 | 1 | 2 |

For each sample $\M{p}$, the B-Cubed precision is the fraction of samples in $\M{p}$'s cluster that share $\M{p}$'s class, while the B-Cubed recall is the fraction of samples in $\M{p}$'s class that share $\M{p}$'s cluster:

\begin{align} \op{precision}(\M{p}) &:= \frac{\left\lvert\Set{\M{q}|C(\M{q})=C(\M{p}), L(\M{q})=L(\M{p})}\right\rvert}{\left\lvert\Set{\M{q}|C(\M{q})=C(\M{p})}\right\rvert}\\ \op{recall}(\M{p}) &:= \frac{\left\lvert\Set{\M{q}|C(\M{q})=C(\M{p}), L(\M{q})=L(\M{p})}\right\rvert}{\left\lvert\Set{\M{q}|L(\M{q})=L(\M{p})}\right\rvert}. \end{align}

For the above example, the precision and recall for each point are

| index $i$ | precision for $\M{p}_i$ | recall for $\M{p}_i$ |
|---|---|---|
| 0 | 1 | 0.5 |
| 1 | 1 | 0.5 |
| 2 | 1 | 1 |
| 3 | 1 | 1 |

and so the overall precision and recall are 1 and 0.75, respectively.

class BCubed:
    """Compute B-Cubed precision and recall.

    Parameters
    -----------
    class_labels: int array
        Non-negative integer class labels for certain samples.
    cluster_labels: int array
        Corresponding non-negative integer cluster labels.

    Attributes
    -----------
    precisions: array of float
        B-Cubed precisions for each sample.
    recalls: array of float
        B-Cubed recalls for each sample.
    precision: float
        Overall B-Cubed precision.
    recall: float
        Overall B-Cubed recall.
    """

    def __init__(self, class_labels, cluster_labels):
        self.class_labels = np.asarray(class_labels)
        self.cluster_labels = np.asarray(cluster_labels)

        eq_class = self.class_labels[:, None] == self.class_labels[None, :]
        eq_cluster = self.cluster_labels[:, None] == self.cluster_labels[None, :]

        TPs = (eq_class & eq_cluster).sum(axis=1)
        # YOUR CODE HERE
        raise NotImplementedError()
        self.precision = self.precisions.mean()
        self.recall = self.recalls.mean()

    def __repr__(self):
        return f"Precision: {self.precision:.3g}\n" + f"Recall: {self.recall:.3g}"
# tests
# simple case
bcubed_eval = BCubed(class_labels=[0, 0, 1, 1], cluster_labels=[0, 1, 2, 2])
assert np.isclose(bcubed_eval.precisions, [1, 1, 1, 1], rtol=1e-3).all()
assert np.isclose(bcubed_eval.recalls, [0.5, 0.5, 1, 1], rtol=1e-3).all()
# test on iris
bcubed_eval = BCubed(dataset["target"], labels)
print(bcubed_eval)
assert np.isclose(bcubed_eval.precision, 0.9252136752136751, rtol=1e-3)
assert np.isclose(bcubed_eval.recall, 0.9253333333333335, rtol=1e-3)
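For reference, here is a self-contained sketch of the same computation outside the class (one possible way, stated as an assumption, to complete the # YOUR CODE HERE above): divide the per-sample true-positive counts by the cluster sizes for precision, and by the class sizes for recall.

def bcubed_sketch(class_labels, cluster_labels):
    """A hedged sketch of B-Cubed precision and recall (mirrors BCubed above)."""
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)
    eq_class = class_labels[:, None] == class_labels[None, :]
    eq_cluster = cluster_labels[:, None] == cluster_labels[None, :]
    # TPs[p] counts samples sharing both p's class and p's cluster (incl. p)
    TPs = (eq_class & eq_cluster).sum(axis=1)
    precisions = TPs / eq_cluster.sum(axis=1)  # divide by cluster sizes
    recalls = TPs / eq_class.sum(axis=1)  # divide by class sizes
    return precisions.mean(), recalls.mean()

print(bcubed_sketch([0, 0, 1, 1], [0, 1, 2, 2]))  # expected: (1.0, 0.75)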

Classes to Clusters Evaluation

Instead of using pairwise comparison, Weka uses the classes-to-clusters evaluation. The computation of accuracy can be cast as a linear sum assignment problem, also known as the maximum weight matching in bipartite graphs. More precisely, with $n_{ij}$ denoting the number of samples with class label $i$ and cluster label $j$,

\begin{align} \op{accuracy} := \max_{\sigma} \frac{1}{|D|}\sum_{i} n_{i\sigma(i)}, \end{align}

where the maximization is over all injective mappings $\sigma$ from class labels to cluster labels.

It will be useful to compute the matrix $[n_{ij}]$, with class labels $i$ as row indices and cluster labels $j$ as column indices. The following function implements this computation.

def class_cluster_counts(class_labels, cluster_labels):
    """Returns the class-cluster count matrix with rows and columns indexed by
    class and cluster labels, respectively.

    Parameters:
    -----------
    class_labels (array): non-negative integer class labels for certain samples
    cluster_labels (array): corresponding non-negative integer cluster labels

    Returns:
    --------
    counts: a matrix of counts of samples with rows indexed by class labels
            and columns indexed by cluster labels.
    """
    counts = np.zeros((class_labels.max() + 1, cluster_labels.max() + 1), dtype=int)
    for i, j in np.column_stack((class_labels, cluster_labels)):
        counts[i, j] += 1
    return counts

For the $k$-means clustering on the iris dataset, the matrix $[n_{ij}]$ of class-cluster counts is:

counts = class_cluster_counts(dataset["target"], labels)
df_counts = pd.DataFrame(counts)
df_counts
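As a sanity check, pandas can build the same matrix directly; the sketch below uses pd.crosstab on the class and cluster labels.

# A hedged sanity check: pd.crosstab should reproduce class_cluster_counts.
df_check = pd.crosstab(
    pd.Series(dataset["target"], name="class"),
    pd.Series(labels, name="cluster"),
)
assert (df_check.to_numpy() == counts).all()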

We can use linear_sum_assignment from scipy.optimize module to find the optimal assignment as follows:

from scipy.optimize import linear_sum_assignment
classes, clusters = linear_sum_assignment(counts, maximize=True)

The following highlights the optimal assignment on the class-cluster count matrix.

def highlight(data):
    attr = "background-color: yellow"
    return pd.DataFrame(
        [
            [attr if i == j else "" for j in range(len(data.columns))]
            for i in range(len(data.index))
        ],
        index=data.index,
        columns=data.columns,
    )


df_counts.style.apply(highlight, axis=None, subset=(classes, clusters))
class Classes2ClustersEvaluation:
    """Classes-to-clusters evaluation.

    Parameters:
    -----------
    class_labels: array
        Non-negative integer class labels for certain samples.
    cluster_labels: array
        Corresponding non-negative integer cluster labels.

    Attributes:
    -----------
    accuracy: float
        fraction of correctly clustered instances.
    classes: int array
        Assigned class labels sorted in ascending order.
    clusters: int array
        Cluster labels corresponding to the assigned class labels.
    """

    def __init__(self, class_labels, cluster_labels):
        self.class_labels = np.asarray(class_labels)
        self.cluster_labels = np.asarray(cluster_labels)
        # YOUR CODE HERE
        raise NotImplementedError()

    def __repr__(self):
        s = f"Accuracy: {self.accuracy:.3g}\n"
        for i, j in np.column_stack((self.classes, self.clusters)):
            s += f"Class #{i} --> Cluster #{j}\n"
        return s
# tests
# simple case
c2c_eval = Classes2ClustersEvaluation(np.array([0, 0, 1, 1]), np.array([0, 1, 2, 2]))
assert (c2c_eval.classes == [0, 1]).all()
assert (c2c_eval.clusters == [0, 2]).all()
assert np.isclose(c2c_eval.accuracy, 0.75, rtol=1e-3)
# test on iris
c2c_eval = Classes2ClustersEvaluation(dataset["target"], labels)
print(c2c_eval)
assert np.isclose(c2c_eval.accuracy, 0.96, rtol=1e-3)
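One plausible completion of Classes2ClustersEvaluation.__init__ (an assumption, not necessarily the intended solution) reuses class_cluster_counts and linear_sum_assignment: the accuracy is the total count on the optimally assigned entries divided by the number of samples.

def c2c_sketch(class_labels, cluster_labels):
    """A hedged sketch of classes-to-clusters accuracy (mirrors the class above)."""
    class_labels = np.asarray(class_labels)
    cluster_labels = np.asarray(cluster_labels)
    counts = class_cluster_counts(class_labels, cluster_labels)
    # maximize the total count over injective class-to-cluster assignments
    classes, clusters = linear_sum_assignment(counts, maximize=True)
    accuracy = counts[classes, clusters].sum() / len(class_labels)
    return accuracy, classes, clusters

print(c2c_sketch(np.array([0, 0, 1, 1]), np.array([0, 1, 2, 2])))
# expected: accuracy 0.75, with class 0 --> cluster 0 (a tie with cluster 1)
# and class 1 --> cluster 2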