import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import interact
from sklearn import datasets, preprocessing
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from util import plot_cluster_regions
%matplotlib widget

Recall that the classification problem for the iris dataset is to classify the iris species based on the lengths and widths of the petals and sepals. In this notebook, we will cluster the instances in the iris dataset with the class attribute removed. The purpose is to evaluate whether a partitional clustering algorithm can identify the 3 different iris species without looking at the class attribute.
Clustering using Weka
The Explorer interface has a `Cluster` panel for clustering. Follow the procedure below to cluster the `iris.2D` dataset:
- Using the `Preprocess` panel, load `iris.2D.arff` from the Weka data folder.
- Using the `Cluster` panel, choose the `Clusterer` as `SimpleKMeans`, which implements the $k$-means clustering algorithm.
- The default number of clusters is $k=2$. Change it to $k=3$ instead, i.e., set `numClusters` to 3.
- Click the `ignore attributes` button below `Cluster mode` and select `class`.
- Click `Start` to run the clustering algorithm.
# Exercise placeholder: the NotImplementedError below is raised until it is
# replaced by a solution. `wss` is displayed afterwards, so the solution is
# presumably expected to define it -- confirm against the exercise text.
# YOUR CODE HERE
raise NotImplementedError()
wss

# Empty float-typed frame with one column per clustering feature; the
# exercise code below is presumably meant to fill in the centroids.
df_centroids = pd.DataFrame(columns=["petallength", "petalwidth"], dtype=float)
# YOUR CODE HERE
raise NotImplementedError()
df_centroids

YOUR ANSWER HERE
YOUR ANSWER HERE
# Exercise placeholder: the NotImplementedError below is raised until it is
# replaced by a solution. NOTE(review): `error_rate` is referenced right
# after this stub, so the solution presumably has to define it -- confirm
# against the exercise text.
# YOUR CODE HERE
raise NotImplementedError()
error_rate

YOUR ANSWER HERE
Clustering using scikit-learn
We first import the iris dataset from sklearn.datasets and store it as a DataFrame.
# Fetch the iris data that ships with scikit-learn.
dataset = datasets.load_iris()

# Tabulate the measurements, one column per feature name, for easier analysis.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

# Add the class column as a categorical: the integer codes in dataset.target
# become the categories, which are then renamed to dataset.target_names.
df["target"] = (
    pd.Series(dataset.target, index=df.index)
    .astype("category")
    .cat.rename_categories(dataset.target_names)
)
df # display an overview of the data

To normalize the features followed by $k$-means clustering, we create a pipeline as follows:
from sklearn.cluster import KMeans

# Two-step pipeline: min-max scale the features, then run k-means with
# 3 clusters on the scaled values. make_pipeline names each step after its
# lower-cased class name, which is why the fitted clusterer is looked up
# later as kmeans_minmax_normalized["kmeans"].
kmeans_minmax_normalized = make_pipeline(
    preprocessing.MinMaxScaler(), KMeans(n_clusters=3)
)
kmeans_minmax_normalized

To cluster based on the two input features `petal length (cm)` and `petal width (cm)`:
# Cluster on the two petal measurements only.
feature1, feature2 = "petal length (cm)", "petal width (cm)"
petal_features = df[[feature1, feature2]]
kmeans_minmax_normalized.fit(petal_features)
cluster_labels = kmeans_minmax_normalized.predict(petal_features)

# One figure, two side-by-side panels sharing the same scatter coordinates.
plt.figure(num=1, figsize=(10, 5))

# Left panel: points coloured by the cluster label predicted for each one.
plt.subplot(121)
plt.scatter(df[feature1], df[feature2], c=cluster_labels)
plt.title("Cluster assignment")
plt.xlabel(feature1)
plt.ylabel(feature2)

# Right panel: the same points coloured by dataset["target"].
plt.subplot(122)
plt.scatter(df[feature1], df[feature2], c=dataset["target"])
plt.title("Class (ground truth)")
plt.xlabel(feature1)
plt.show()

Since clustering is unsupervised, unlike classification,
- the `fit` method of `kmeans_minmax_normalized` does not take the target attribute as an argument, and
- the `predict` method returns cluster labels that may not be associated with the class labels.
Further details can be obtained from the fitted properties (those with a trailing underscore):
# Pull the fitted k-means step out of the pipeline by its step name.
kmeans = kmeans_minmax_normalized.named_steps["kmeans"]

# Array-valued attributes are unpacked so that print() writes their entries
# separated by spaces rather than as one bracketed array.
fitted_labels, fitted_centers = kmeans.labels_, kmeans.cluster_centers_
print("Cluster labels:", *fitted_labels)
print("Cluster centers:", *fitted_centers)
print("WSS:", kmeans.inertia_)
print("# iterations:", kmeans.n_iter_)

Similar to plotting the decision regions for a classifier, we provide the function `plot_cluster_regions` in `util.py` to plot the cluster regions for a clusterer.
from util import plot_cluster_regions

help(plot_cluster_regions)

# The interactive figure is only built when the user answers "y" (any case)
# at the prompt.
if input("Execute? [y/N]").lower() == "y":
    fig, ax = plt.subplots(
        nrows=1,
        ncols=1,
        clear=True,
        figsize=(10, 10),
        layout="constrained",
        num=2,
        sharey=True,
    )

    @interact(
        normalization=["None", "Min-max", "Standard"],
        feature1=dataset.feature_names,
        feature2=dataset.feature_names,
        k=widgets.IntSlider(3, 1, 4, continuous_update=False),
        resolution=widgets.IntSlider(1, 1, 4, continuous_update=False),
    )
    def cluster_regions_kmeans(
        normalization,
        feature1=dataset.feature_names[2],
        feature2=dataset.feature_names[3],
        k=3,
        resolution=1,
    ):
        """Redraw the cluster regions on ``ax`` for the selected widget values.

        Registered with ``interact``, which supplies one control per
        parameter.

        Args:
            normalization: One of "None", "Min-max" or "Standard". The last
                two are the keys of the ``scaler`` lookup below.
            feature1: Column of ``df`` used as the first feature and as the
                x-axis label.
            feature2: Column of ``df`` used as the second feature and as the
                y-axis label.
            k: Value of the ``k`` slider. Presumably the number of clusters
                for the exercise code -- confirm against the exercise text.
            resolution: Value of the ``resolution`` slider;
                ``plot_cluster_regions`` receives ``N=resolution * 100``.

        Raises:
            NotImplementedError: Always, until the exercise placeholder is
                replaced by a solution.
        """
        # Maps a normalization choice to the scaler class to instantiate;
        # "None" has no entry.
        scaler = {
            "Min-max": preprocessing.MinMaxScaler,
            "Standard": preprocessing.StandardScaler,
        }
        # YOUR CODE HERE
        raise NotImplementedError()
        # Wipe the previous drawing so repeated widget updates do not stack.
        ax.clear()
        # NOTE(review): `kmeans` is presumably meant to be defined by the
        # exercise code above; otherwise the module-level `kmeans` is
        # used -- confirm.
        plot_cluster_regions(
            df[[feature1, feature2]], df.target, kmeans, N=resolution * 100,
            ax=ax
        )
        ax.set_title("Cluster regions for k-means clustering")
        ax.set_xlabel(feature1)
        ax.set_ylabel(feature2)
        plt.show()