KMeans clustering

Contents

KMeans clustering#

Intuition#

Practice#

import numpy as np
from sklearn.datasets import make_blobs

import plotly
plotly.offline.init_notebook_mode()

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from bokeh.plotting import figure, show
# from bokeh.sampledata.penguins import data
from bokeh.transform import factor_cmap, factor_mark
from bokeh.io import output_notebook
from bokeh.models import LinearAxis, Range1d, ColumnDataSource, HoverTool, LinearColorMapper, ColorBar

output_notebook(hide_banner=True)

def plot_kmeans_plotly(data, labels=None, centroids=None, regions=None, title=""):
    fig = go.Figure()
    if labels is None:
        x, y = data[:, 0], data[:, 1]
        fig.add_trace(go.Scatter(x=x, y=y, mode='markers', marker=dict(size=10, color='blue'), name="Data"))
    else:
        for i in np.unique(labels):
            fig.add_trace(
                go.Scatter(
                    x=data[labels == i, 0],
                    y=data[labels == i, 1],
                    mode='markers', 
                    marker=dict(size=10, color=px.colors.qualitative.Plotly[i]),
                    name=f"Cluster {i}"
                )
            )
    if centroids is not None:
        fig.add_trace(
            go.Scatter(
                x=centroids[:, 0], 
                y=centroids[:, 1], 
                mode='markers+text',
                name="Centroids",
                marker=dict(size=15, color='red', symbol='star', line=dict(color='black', width=2)),
            )
        )
        if regions is not None:
            x, y, Z = regions
            fig.add_trace(go.Heatmap(x=x, y=y, z=Z, colorscale='Rainbow', showscale=False, opacity=0.4))
            fig.update_xaxes(range=[x.min(), x.max()])
            fig.update_yaxes(range=[y.min(), y.max()])
            if title: title += " with Voronoi regions"
        
    # draw heatmap to indicates the distance between data points and centroids
    fig.update_layout(title=title, legend=dict(orientation="h", yanchor="bottom", y=1.0, xanchor="right", x=1))
    return fig

def plot_kmeans_bokeh(data, labels=None, centroids=None, regions=None, title=""):
    p = figure(sizing_mode='scale_width', aspect_ratio=2.4)
    x, y = data[:, 0], data[:, 1]
    
    if regions is not None:
        x_min, x_max = data[:, 0].min(), data[:, 0].max()
        y_min, y_max = data[:, 1].min(), data[:, 1].max()
        lower_bound = min(x_min, y_min) - 1
        upper_bound = max(x_max, y_max) + 1
        
        x_region, y_region, Z = regions
        p.image(
            image=[Z], x=lower_bound, y=lower_bound, dw=upper_bound-lower_bound, dh=upper_bound-lower_bound, global_alpha=0.36, color_mapper="Spectral4"
        )
        p.x_range.range_padding = p.y_range.range_padding = 0
    
    if labels is None:
        data_source = ColumnDataSource(data=dict(x=x, y=y))
        data_plot = p.scatter('x', 'y', source=data_source, size=10, color='blue', fill_alpha=0.4)
        p.add_tools(HoverTool(renderers=[data_plot], tooltips=[("x", "@x"), ("y", "@y")]))
    else:
        for i in np.unique(labels):
            x, y = data[labels == i, 0], data[labels == i, 1]
            source = ColumnDataSource(data=dict(x=x, y=y, desc=[f"Cluster {i}"]*len(x)))
            data_plot = p.scatter('x', 'y', source=source, size=10, color=px.colors.qualitative.Plotly[i], legend_label=f"Cluster {i}")
            p.add_tools(HoverTool(renderers=[data_plot], tooltips=[("x", "@x"), ("y", "@y"), ("desc", "@desc")]))
            p.legend.location = "top_left"
            p.legend.click_policy = "hide"
    
    if centroids is not None:
        centroids_source = ColumnDataSource(data=dict(x=centroids[:, 0], y=centroids[:, 1]))
        centroid_plot = p.scatter('x', 'y', source=centroids_source, size=16, color='red', marker='star', fill_alpha=0.64, line_color='black', line_width=2)
        p.add_tools(HoverTool(renderers=[centroid_plot], tooltips=[("x", "@x"), ("y", "@y")]))

    p.title.text = title
    show(p)

# Now, we create a sample dataset using `make_blobs` function from `sklearn.datasets`
X, y = make_blobs(n_samples=20, centers=3, random_state=42)

# # Plot the data points with plotly
# plot_kmeans_plotly(X, title="Data")

# Draw the data points with bokeh
plot_kmeans_bokeh(X, y, title="Data")

class KMeans:
    def __init__(self, n_clusters: int, n_iters: int, seed=42):
        self.n_clusters = n_clusters
        self.n_iters = n_iters
        self.data = None
        self.seed = seed
        
        self.inertia = None
        self.dunn_index = None


    def fit(self, X):
        assert len(X) >= self.n_clusters
        self.data = X
        
        centroids = self._init_centroids() # (k, n)
        old_centroids = None
        
        for _ in range(self.n_iters):
            distances = self._calculate_distance(self.data, centroids) # (m, k)
            labels = np.argmin(distances, axis=1) # (m)
            
            # update centroids
            old_centroids = centroids
            centroids = self._update_centroids(self.data, labels, self.n_clusters)
            if np.all(centroids - old_centroids < 1e-5):
                break
        
        self.centroids = centroids
        self.labels = labels
        
        self.inertia = self._calculate_inertia()
        self.dunn_index = self._calculate_dunn_index()

        return centroids

    def _init_centroids(self):
        np.random.seed(self.seed)
        ids = np.random.choice(np.arange(self.data.shape[0]), self.n_clusters, replace=False)
        return self.data[ids]


    def _calculate_distance(self, X, Y):
        # X in shape (m1, n) and Y in shape(m2, n)
        # distances in shape (m1, m2)
        X = X.reshape(X.shape[0], 1, X.shape[1])
        Y = Y.reshape(1, Y.shape[0], Y.shape[1])
        distances = np.sqrt(np.sum((X - Y) ** 2, axis=2))
        return distances


    def _update_centroids(self, X, labels, n_clusters):
        centroids = []
        for i in range(n_clusters):
            ids = labels == i
            centroid = np.mean(X[ids], axis=0)
            centroids.append(centroid)
        centroids = np.array(centroids)
        return centroids


    def _calculate_inertia(self):
        return sum([
            np.sum((x - self.centroids[l])**2) for l, x in zip(self.labels, self.data)
        ])


    def _calculate_dunn_index(self):
        
        max_intra_cluster_distance = 0.0
        for i in range(len(self.centroids)):
            centroid = self.centroids[i]
            X_cluster = self.data[self.labels == i]
            distances = np.linalg.norm(X_cluster - centroid, axis=1)
            max_intra_cluster_distance = max(max_intra_cluster_distance, np.max(distances))
            
        min_inter_cluster_distance = np.inf
        for i in range(len(self.centroids)):
            for j in range(i+1, len(self.centroids)):
                distance = np.linalg.norm(self.centroids[i] - self.centroids[j])
                min_inter_cluster_distance = min(min_inter_cluster_distance, distance)
        
        return min_inter_cluster_distance / max_intra_cluster_distance


    def get_inertia(self):
        return self.inertia


    def get_dunn_index(self):
        return self.dunn_index

kmeans = KMeans(n_clusters=4, n_iters=10, seed=42)
centroids = kmeans.fit(X)

data = X
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
lower_bound = min(x_min, y_min) - 1
upper_bound = max(x_max, y_max) + 1

x = np.linspace(lower_bound, upper_bound, 1000)
y = np.linspace(lower_bound, upper_bound, 1000)
xx, yy = np.meshgrid(x, y)
Z = kmeans._calculate_distance(np.c_[xx.ravel(), yy.ravel()], centroids).argmin(axis=1)
Z = Z.reshape(xx.shape)

# # Plot with Plotly
# plot_kmeans_plotly(X, centroids=centroids, regions=(x, y, Z), title="KMeans Clustering")

# Plot the data with bokeh
plot_kmeans_bokeh(X, centroids=centroids, regions=(x, y, Z), title="KMeans Clustering")

Elbow method#

The elbow method is a heuristic used in determining the number of clusters in a data set. The method consists of plotting the explained variation as a function of the number of clusters, and picking the elbow of the curve as the number of clusters to use.

inertia_values = []
dunn_index_values = []
min_clusters, max_clusters = 2, 10
for i in range(min_clusters, max_clusters+1):
    kmeans = KMeans(n_clusters=i, n_iters=10, seed=42)
    centroids = kmeans.fit(X)
    
    inertia = kmeans.get_inertia()
    inertia_values.append(inertia)
    
    dunn_index = kmeans.get_dunn_index()
    dunn_index_values.append(dunn_index)

# # Create figure with secondary y-axis
# fig = make_subplots(specs=[[{"secondary_y": True}]])
# fig.add_trace(
#     go.Scatter(x=np.arange(min_clusters, max_clusters+1), y=inertia_values, mode='lines+markers', name="Inertia")
# )
# fig.add_trace(
#     go.Scatter(x=np.arange(min_clusters, max_clusters+1), y=dunn_index_values, mode='lines+markers', name='Dunn Index'), 
#     secondary_y=True
# )
# fig.update_layout(
#     title="Inertia and Dunn Index of KMeans Clustering",
#     xaxis_title="Number of clusters", 
#     yaxis_title="Inertia",
#     yaxis2_title="Dunn Index",
#     legend=dict(orientation="h", yanchor="bottom", y=1.0, xanchor="right", x=1)
# )
# fig.show()

x = np.arange(min_clusters, max_clusters+1)
src = ColumnDataSource(data=dict(x=x, y1=inertia_values, y2=dunn_index_values))

p = figure(x_axis_label="#Clusters", y_axis_label="Inertia", sizing_mode='scale_width', aspect_ratio=2.5)

line1 = p.line('x', 'y1', source=src, line_width=2, color='blue', legend_label='Inertia')
p.y_range = Range1d(min(inertia_values) - 0.05*max(inertia_values), max(inertia_values)*1.1)

p.extra_y_ranges = {"y2": Range1d(start=min(dunn_index_values) - 0.05*max(dunn_index_values), end=max(dunn_index_values)*1.1)}
p.add_layout(LinearAxis(y_range_name="y2", axis_label="Dunn Index"), 'right')
line2 = p.line('x', 'y2', source=src, line_width=2, color='red', legend_label='Dunn Index', y_range_name="y2")


p.add_tools(HoverTool(tooltips=[("#Clusters", "@x"), ("Inertia", "@y1"), ("Dunn Index", "@y2")], mode="vline"))
p.legend.location = "top_right"
p.legend.click_policy="hide"

show(p)

By observing the plot of explained variation as a function of the number of clusters, we can see that the inertia decreases quickly at k=2 and k=3. After k=3, the decrease in inertia is much slower. Therefore, we can conclude that the optimal number of clusters is k=3.

If we use Dunn Index, the peak of the curve is at k=3 as well.

Finally, we plot the clusters and the centroids when k=3 as follows:

# Train the KMeans model with 3 clusters
kmeans = KMeans(n_clusters=3, n_iters=10, seed=42)
centroids = kmeans.fit(X)

# Get the regions of the data points
data = X
x_min, x_max = data[:, 0].min(), data[:, 0].max()
y_min, y_max = data[:, 1].min(), data[:, 1].max()
lower_bound = min(x_min, y_min) - 1
upper_bound = max(x_max, y_max) + 1

x = np.linspace(lower_bound, upper_bound, 1000)
y = np.linspace(lower_bound, upper_bound, 1000)
xx, yy = np.meshgrid(x, y)
Z = kmeans._calculate_distance(np.c_[xx.ravel(), yy.ravel()], centroids).argmin(axis=1)
Z = Z.reshape(xx.shape)

# # Plot the regions with Plotly
# plot_kmeans_plotly(X, centroids=centroids, title="KMeans Clusterings with 3 clusters", regions=(x, y, Z))


# # Plot the regions with Bokeh
plot_kmeans_bokeh(X, centroids=centroids, regions=(x, y, Z), title="KMeans Clusterings with 3 clusters")

References#

Appendix#

Calculating the distance pairwisely between two collections of vectors with vectorization#

Non-vectorized version

%%timeit -n 1000

a = np.array([[1, 1], [2, 2], [3, 3]])
b = np.array([[4, 4], [5, 5]])

distances = np.zeros((len(a), len(b)))
for i in range(len(a)):
    for j in range(len(b)):
        distances[i][j] = np.sqrt(np.sum((a[i] - b[j])**2))
distances

35.1 μs ± 1.04 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Vectorized version

%%timeit -n 1000

# a in shape (m, n) and b in shape (k, n)
a = np.array([[1, 1], [2, 2], [3, 3]])
b = np.array([[4, 4], [5, 5]])

# after reshape, a in shape (m, 1, n) and b in shape (1, k, n)
a = a.reshape(a.shape[0], 1, a.shape[1])
b = b.reshape(1, b.shape[0], b.shape[1])

distances = np.sqrt(np.sum((a - b) ** 2, axis=2))
distances

9.42 μs ± 350 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)