KMeans clustering#
Intuition#
Practice#
import numpy as np
from sklearn.datasets import make_blobs
import plotly
plotly.offline.init_notebook_mode()
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from bokeh.plotting import figure, show
# from bokeh.sampledata.penguins import data
from bokeh.transform import factor_cmap, factor_mark
from bokeh.io import output_notebook
from bokeh.models import LinearAxis, Range1d, ColumnDataSource, HoverTool, LinearColorMapper, ColorBar
output_notebook(hide_banner=True)
def plot_kmeans_plotly(data, labels=None, centroids=None, regions=None, title=""):
fig = go.Figure()
if labels is None:
x, y = data[:, 0], data[:, 1]
fig.add_trace(go.Scatter(x=x, y=y, mode='markers', marker=dict(size=10, color='blue'), name="Data"))
else:
for i in np.unique(labels):
fig.add_trace(
go.Scatter(
x=data[labels == i, 0],
y=data[labels == i, 1],
mode='markers',
marker=dict(size=10, color=px.colors.qualitative.Plotly[i]),
name=f"Cluster {i}"
)
)
if centroids is not None:
fig.add_trace(
go.Scatter(
x=centroids[:, 0],
y=centroids[:, 1],
mode='markers+text',
name="Centroids",
marker=dict(size=15, color='red', symbol='star', line=dict(color='black', width=2)),
)
)
if regions is not None:
x, y, Z = regions
fig.add_trace(go.Heatmap(x=x, y=y, z=Z, colorscale='Rainbow', showscale=False, opacity=0.4))
fig.update_xaxes(range=[x.min(), x.max()])
fig.update_yaxes(range=[y.min(), y.max()])
if title: title += " with Voronoi regions"
# draw heatmap to indicates the distance between data points and centroids
fig.update_layout(title=title, legend=dict(orientation="h", yanchor="bottom", y=1.0, xanchor="right", x=1))
return fig
def plot_kmeans_bokeh(data, labels=None, centroids=None, regions=None, title=""):
p = figure(sizing_mode='scale_width', aspect_ratio=2.4)
x, y = data[:, 0], data[:, 1]
if regions is not None:
x_min, x_max = data[:, 0].min(), data[:, 0].max()
y_min, y_max = data[:, 1].min(), data[:, 1].max()
lower_bound = min(x_min, y_min) - 1
upper_bound = max(x_max, y_max) + 1
x_region, y_region, Z = regions
p.image(
image=[Z], x=lower_bound, y=lower_bound, dw=upper_bound-lower_bound, dh=upper_bound-lower_bound, global_alpha=0.36, color_mapper="Spectral4"
)
p.x_range.range_padding = p.y_range.range_padding = 0
if labels is None:
data_source = ColumnDataSource(data=dict(x=x, y=y))
data_plot = p.scatter('x', 'y', source=data_source, size=10, color='blue', fill_alpha=0.4)
p.add_tools(HoverTool(renderers=[data_plot], tooltips=[("x", "@x"), ("y", "@y")]))
else:
for i in np.unique(labels):
x, y = data[labels == i, 0], data[labels == i, 1]
source = ColumnDataSource(data=dict(x=x, y=y, desc=[f"Cluster {i}"]*len(x)))
data_plot = p.scatter('x', 'y', source=source, size=10, color=px.colors.qualitative.Plotly[i], legend_label=f"Cluster {i}")
p.add_tools(HoverTool(renderers=[data_plot], tooltips=[("x", "@x"), ("y", "@y"), ("desc", "@desc")]))
p.legend.location = "top_left"
p.legend.click_policy = "hide"
if centroids is not None:
centroids_source = ColumnDataSource(data=dict(x=centroids[:, 0], y=centroids[:, 1]))
centroid_plot = p.scatter('x', 'y', source=centroids_source, size=16, color='red', marker='star', fill_alpha=0.64, line_color='black', line_width=2)
p.add_tools(HoverTool(renderers=[centroid_plot], tooltips=[("x", "@x"), ("y", "@y")]))
p.title.text = title
show(p)
# Now, we create a sample dataset using `make_blobs` function from `sklearn.datasets`
X, y = make_blobs(n_samples=20, centers=3, random_state=42)
# # Plot the data points with plotly
# plot_kmeans_plotly(X, title="Data")
# Draw the data points with bokeh
plot_kmeans_bokeh(X, y, title="Data")
class KMeans:
def __init__(self, n_clusters: int, n_iters: int, seed=42):
self.n_clusters = n_clusters
self.n_iters = n_iters
self.data = None
self.seed = seed
self.inertia = None
self.dunn_index = None
def fit(self, X):
assert len(X) >= self.n_clusters
self.data = X
centroids = self._init_centroids() # (k, n)
old_centroids = None
for _ in range(self.n_iters):
distances = self._calculate_distance(self.data, centroids) # (m, k)
labels = np.argmin(distances, axis=1) # (m)
# update centroids
old_centroids = centroids
centroids = self._update_centroids(self.data, labels, self.n_clusters)
if np.all(centroids - old_centroids < 1e-5):
break
self.centroids = centroids
self.labels = labels
self.inertia = self._calculate_inertia()
self.dunn_index = self._calculate_dunn_index()
return centroids
def _init_centroids(self):
np.random.seed(self.seed)
ids = np.random.choice(np.arange(self.data.shape[0]), self.n_clusters, replace=False)
return self.data[ids]
def _calculate_distance(self, X, Y):
# X in shape (m1, n) and Y in shape(m2, n)
# distances in shape (m1, m2)
X = X.reshape(X.shape[0], 1, X.shape[1])
Y = Y.reshape(1, Y.shape[0], Y.shape[1])
distances = np.sqrt(np.sum((X - Y) ** 2, axis=2))
return distances
def _update_centroids(self, X, labels, n_clusters):
centroids = []
for i in range(n_clusters):
ids = labels == i
centroid = np.mean(X[ids], axis=0)
centroids.append(centroid)
centroids = np.array(centroids)
return centroids
def _calculate_inertia(self):
return sum([
np.sum((x - self.centroids[l])**2) for l, x in zip(self.labels, self.data)
])
def _calculate_dunn_index(self):
max_intra_cluster_distance = 0.0
for i in range(len(self.centroids)):
centroid = self.centroids[i]
X_cluster = self.data[self.labels == i]
distances = np.linalg.norm(X_cluster - centroid, axis=1)
max_intra_cluster_distance = max(max_intra_cluster_distance, np.max(distances))
min_inter_cluster_distance = np.inf
for i in range(len(self.centroids)):
for j in range(i+1, len(self.centroids)):
distance = np.linalg.norm(self.centroids[i] - self.centroids[j])
min_inter_cluster_distance = min(min_inter_cluster_distance, distance)
return min_inter_cluster_distance / max_intra_cluster_distance
def get_inertia(self):
return self.inertia
def get_dunn_index(self):
return self.dunn_index
kmeans = KMeans(n_clusters=4, n_iters=10, seed=42)
centroids = kmeans.fit(X)
data = X
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
lower_bound = min(x_min, y_min) - 1
upper_bound = max(x_max, y_max) + 1
x = np.linspace(lower_bound, upper_bound, 1000)
y = np.linspace(lower_bound, upper_bound, 1000)
xx, yy = np.meshgrid(x, y)
Z = kmeans._calculate_distance(np.c_[xx.ravel(), yy.ravel()], centroids).argmin(axis=1)
Z = Z.reshape(xx.shape)
# # Plot with Plotly
# plot_kmeans_plotly(X, centroids=centroids, regions=(x, y, Z), title="KMeans Clustering")
# Plot the data with bokeh
plot_kmeans_bokeh(X, centroids=centroids, regions=(x, y, Z), title="KMeans Clustering")
Elbow method#
The elbow method is a heuristic used in determining the number of clusters in a data set. The method consists of plotting the explained variation as a function of the number of clusters, and picking the elbow of the curve as the number of clusters to use.
inertia_values = []
dunn_index_values = []
min_clusters, max_clusters = 2, 10
for i in range(min_clusters, max_clusters+1):
kmeans = KMeans(n_clusters=i, n_iters=10, seed=42)
centroids = kmeans.fit(X)
inertia = kmeans.get_inertia()
inertia_values.append(inertia)
dunn_index = kmeans.get_dunn_index()
dunn_index_values.append(dunn_index)
# # Create figure with secondary y-axis
# fig = make_subplots(specs=[[{"secondary_y": True}]])
# fig.add_trace(
# go.Scatter(x=np.arange(min_clusters, max_clusters+1), y=inertia_values, mode='lines+markers', name="Inertia")
# )
# fig.add_trace(
# go.Scatter(x=np.arange(min_clusters, max_clusters+1), y=dunn_index_values, mode='lines+markers', name='Dunn Index'),
# secondary_y=True
# )
# fig.update_layout(
# title="Inertia and Dunn Index of KMeans Clustering",
# xaxis_title="Number of clusters",
# yaxis_title="Inertia",
# yaxis2_title="Dunn Index",
# legend=dict(orientation="h", yanchor="bottom", y=1.0, xanchor="right", x=1)
# )
# fig.show()
x = np.arange(min_clusters, max_clusters+1)
src = ColumnDataSource(data=dict(x=x, y1=inertia_values, y2=dunn_index_values))
p = figure(x_axis_label="#Clusters", y_axis_label="Inertia", sizing_mode='scale_width', aspect_ratio=2.5)
line1 = p.line('x', 'y1', source=src, line_width=2, color='blue', legend_label='Inertia')
p.y_range = Range1d(min(inertia_values) - 0.05*max(inertia_values), max(inertia_values)*1.1)
p.extra_y_ranges = {"y2": Range1d(start=min(dunn_index_values) - 0.05*max(dunn_index_values), end=max(dunn_index_values)*1.1)}
p.add_layout(LinearAxis(y_range_name="y2", axis_label="Dunn Index"), 'right')
line2 = p.line('x', 'y2', source=src, line_width=2, color='red', legend_label='Dunn Index', y_range_name="y2")
p.add_tools(HoverTool(tooltips=[("#Clusters", "@x"), ("Inertia", "@y1"), ("Dunn Index", "@y2")], mode="vline"))
p.legend.location = "top_right"
p.legend.click_policy="hide"
show(p)
By observing the plot of explained variation as a function of the number of clusters, we can see that the inertia decreases quickly at k=2
and k=3
. After k=3
, the decrease in inertia is much slower. Therefore, we can conclude that the optimal number of clusters is k=3
.
If we use Dunn Index, the peak of the curve is at k=3
as well.
Finally, we plot the clusters and the centroids when k=3
as follows:
# Train the KMeans model with 3 clusters
kmeans = KMeans(n_clusters=3, n_iters=10, seed=42)
centroids = kmeans.fit(X)
# Get the regions of the data points
data = X
x_min, x_max = data[:, 0].min(), data[:, 0].max()
y_min, y_max = data[:, 1].min(), data[:, 1].max()
lower_bound = min(x_min, y_min) - 1
upper_bound = max(x_max, y_max) + 1
x = np.linspace(lower_bound, upper_bound, 1000)
y = np.linspace(lower_bound, upper_bound, 1000)
xx, yy = np.meshgrid(x, y)
Z = kmeans._calculate_distance(np.c_[xx.ravel(), yy.ravel()], centroids).argmin(axis=1)
Z = Z.reshape(xx.shape)
# # Plot the regions with Plotly
# plot_kmeans_plotly(X, centroids=centroids, title="KMeans Clusterings with 3 clusters", regions=(x, y, Z))
# # Plot the regions with Bokeh
plot_kmeans_bokeh(X, centroids=centroids, regions=(x, y, Z), title="KMeans Clusterings with 3 clusters")
References#
K-Means:
Plotting with Bokeh:
Hiding lines: https://docs.bokeh.org/en/latest/docs/user_guide/interaction/legends.html#hiding-glyphs
Color palette for this page: https://docs.bokeh.org/en/latest/docs/examples/basic/data/transform_markers.html
https://www.geeksforgeeks.org/creating-a-chart-with-two-different-y-axis-ranges-in-bokeh/
https://stackoverflow.com/questions/25199665/one-chart-with-two-different-y-axis-ranges-in-bokeh
Appendix#
Calculating the distance pairwisely between two collections of vectors with vectorization#
Non-vectorized version
%%timeit -n 1000
a = np.array([[1, 1], [2, 2], [3, 3]])
b = np.array([[4, 4], [5, 5]])
distances = np.zeros((len(a), len(b)))
for i in range(len(a)):
for j in range(len(b)):
distances[i][j] = np.sqrt(np.sum((a[i] - b[j])**2))
distances
35.1 μs ± 1.04 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Vectorized version
%%timeit -n 1000
# a in shape (m, n) and b in shape (k, n)
a = np.array([[1, 1], [2, 2], [3, 3]])
b = np.array([[4, 4], [5, 5]])
# after reshape, a in shape (m, 1, n) and b in shape (1, k, n)
a = a.reshape(a.shape[0], 1, a.shape[1])
b = b.reshape(1, b.shape[0], b.shape[1])
distances = np.sqrt(np.sum((a - b) ** 2, axis=2))
distances
9.42 μs ± 350 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)