k-Nearest Neighbors (kNN)#

Intuition#

kNN is a non-parametric, instance-based method: it makes no assumptions about the shape of the decision boundary and simply memorizes the training set. To classify a new sample, it finds the k training points closest to it under a distance metric (such as Euclidean or Manhattan) and assigns the most common label among those neighbors.
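
As a minimal sketch of this voting rule on made-up 2-D points (toy values, not the iris data):

import numpy as np

# toy training set: five 2-D points with binary labels (illustrative values)
X_toy = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [5.0, 5.0], [5.0, 6.0]])
y_toy = np.array([0, 0, 0, 1, 1])

query = np.array([0.5, 0.5])
k = 3

dists = np.sqrt(((X_toy - query) ** 2).sum(axis=1))  # Euclidean distances
nearest = np.argsort(dists)[:k]                      # indices of the k closest points
pred = np.bincount(y_toy[nearest]).argmax()          # majority vote
print(pred)  # -> 0, the label of the three nearby points
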
Practice#

Import the required libraries, including numpy, pandas, plotly, and bokeh for visualization. We also use train_test_split from sklearn.model_selection to split the dataset into training and testing sets.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from itertools import product

from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show
from bokeh.models import (BasicTicker, ColumnDataSource, DataRange1d,
                          HoverTool, LassoSelectTool, Legend)
from bokeh.palettes import Category10_3

output_notebook()

Dataset#

First, we load the dataset and take a look at its features and target classes.

dataset = load_iris()
X = dataset.data
y = dataset.target
feature_names = dataset.feature_names
target_names = dataset.target_names

print(f'There are {len(X)} samples, each with {len(feature_names)} features as {feature_names}')
print(f'Targets are {target_names}')
There are 150 samples, each with 4 features as ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Targets are ['setosa' 'versicolor' 'virginica']

In this example, we are going to use the first two features of the iris dataset (sepal length and sepal width) to illustrate the kNN algorithm.

Plotting the dataset with pairs of features#

Since we have 4 features, there are 6 possible pairs of features. Let's plot them all as a scatter matrix (each pair appears twice, mirrored across the blanked diagonal).
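
The count follows from choosing 2 of the 4 features; a quick sanity check with itertools.combinations:

from itertools import combinations

pairs = list(combinations(feature_names, 2))
print(len(pairs))  # -> 6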

# n_rows = 3
# n_cols = 2
# fig = make_subplots(rows=3, cols=2, subplot_titles=[f'{feature_names[i][:-5]} vs {feature_names[j][:-5]}' for i in range(len(feature_names)) for j in range(i+1, len(feature_names))])

# colors = ['red', 'green', 'blue']

# index = 0
# for i in range(len(feature_names)):
#     for j in range(i+1, len(feature_names)):
#         for target in range(len(target_names)):
#             X_sub = X[y == target]
#             fig.add_trace(
#                 go.Scatter(
#                     x=X_sub[:, i],
#                     y=X_sub[:, j],
#                     mode='markers',
#                     name=target_names[target],
#                     # legendgroup="group",
#                     marker_color=colors[target],
#                 ),
#                 row=int(index/n_cols)+1,
#                 col=index%n_cols+1,
#             )
#         fig.update_xaxes(title_text=feature_names[i], row=int(index/n_cols)+1, col=index%n_cols+1)
#         fig.update_yaxes(title_text=feature_names[j], row=int(index/n_cols)+1, col=index%n_cols+1)
#         index += 1

# # remove duplicate legend
# names = set()
# fig.for_each_trace(
#     lambda trace:
#         trace.update(showlegend=False)
#         if (trace.name in names) else names.add(trace.name))

# # width and height of the plot
# fig.update_layout(
#     title_text='Iris dataset visualization',
#     showlegend=True,
#     width=800,
#     height=1000,
# )
# fig.update_layout(legend=dict(
#     orientation="h",
#     yanchor="bottom",
#     y=1.04,
#     xanchor="right",
#     x=1
# ))
# fig.show()

# Plotting with Bokeh
df = pd.DataFrame(X, columns=feature_names)
y_df = pd.DataFrame([target_names[i] for i in y], columns=['target'], dtype='str')

df = pd.concat([df, y_df], axis=1)
df.head()

MARKERS = ['hex', 'circle_x', 'triangle']
COLORS = Category10_3
TARGETS = [str(i) for i in sorted(df['target'].unique())]
FEATURES, NUM_FEATURES = feature_names, len(feature_names)

xdrs = [DataRange1d(bounds=None) for _ in range(len(feature_names))]
ydrs = [DataRange1d(bounds=None) for _ in range(len(feature_names))]

plots = []
# iterate over all ordered feature pairs; the loop variables are named
# x_feat/y_feat so they do not shadow the target vector y
for i, (y_feat, x_feat) in enumerate(product(FEATURES, reversed(FEATURES))):
    p = figure(
        title=f'{x_feat} vs {y_feat}',
        width=320,
        height=320,
        x_range=xdrs[i % NUM_FEATURES],
        y_range=ydrs[i // NUM_FEATURES],
    )
    legend_it = []
    for t_idx, t in enumerate(TARGETS):
        df_t = df[df['target'] == t]
        source = ColumnDataSource(df_t)
        scatter_plot = p.scatter(
            x=x_feat,
            y=y_feat,
            source=source,
            marker=MARKERS[t_idx],
            color=COLORS[t_idx],
            fill_alpha=0.4,
            size=8,
            name=t,
        )
        legend_it.append((t, [scatter_plot]))

        hover_tool_x, hover_tool_y = '@{' + x_feat + '}', '@{' + y_feat + '}'
        p.add_tools(HoverTool(tooltips=[(x_feat, hover_tool_x), (y_feat, hover_tool_y)], renderers=[scatter_plot]))

    # build the legend once, after all three classes have been added
    legend = Legend(items=legend_it, location='center', orientation='horizontal')
    legend.click_policy = "hide"
    p.add_layout(legend, 'below')
    p.add_tools(LassoSelectTool())
    p.xaxis.axis_label = x_feat
    p.yaxis.axis_label = y_feat
    plots.append(p)
    
    # suppress the diagonal
    if (i%NUM_FEATURES) + (i//NUM_FEATURES) == NUM_FEATURES-1:
        for glyph in p.renderers:
            glyph.visible = False 
        p.grid.grid_line_color = None
        p.legend.visible = False
        p.title.text = ''
        p.tools = []
        p.xaxis.visible = False
        p.yaxis.visible = False
        p.outline_line_color = None
    
    p.min_border_right = 8
    p.border_fill_color = 'white'

grid_plot = gridplot(plots, ncols=4)
show(grid_plot)

From the visualizations, we observe that the last two features (petal length and petal width) are the most separable. For this walkthrough, however, we keep the first two features (sepal length and sepal width): their overlap makes the task harder and better illustrates how kNN behaves near ambiguous class boundaries.

X = dataset.data
y = dataset.target
X = X[:, :2]  # keep only the first two features: sepal length and sepal width

Next, we split the dataset into a training set and a test set with 20 samples for the test set, and the rest for the training set.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20, random_state=42)
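
The split leaves 130 samples for training and 20 for testing; a quick shape check to confirm:

print(X_train.shape, X_test.shape)  # -> (130, 2) (20, 2)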

Next, we implement the kNN classifier from scratch: fit stores the training data, calculate_distances computes all pairwise distances via numpy broadcasting, and predict takes a majority vote among the labels of the k nearest training points.

class KNNClassifier:
    def __init__(self, k=3, distance='euclidean'):
        self.k = k
        self.distance = distance
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def calculate_distances(self, A, B):
        A = A.reshape(A.shape[0], 1, A.shape[1])
        B = B.reshape(1, B.shape[0], B.shape[1])
        if self.distance.lower() == 'euclidean':
            return np.sqrt(np.sum((A - B)**2, axis=2))
        elif self.distance.lower() == 'manhattan':
            return np.sum(np.abs(A - B), axis=2)
        else:
            raise ValueError('Unknown distance function')
    
    def predict(self, X):
        y_pred = []
        m, _  = X.shape
        
        # calculate distances
        distances = self.calculate_distances(X, self.X_train)
        
        # find k nearest neighbors
        nearest_indices = np.argsort(distances, axis=1)[:, :self.k]
        nearest_labels = self.y_train[nearest_indices]
        
        # find the most common class for each sample
        for i in range(m):
            unique, counts = np.unique(nearest_labels[i], return_counts=True)
            pred = unique[np.argmax(counts)]
            y_pred.append(pred)
        
        return np.array(y_pred)
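
The broadcasting in calculate_distances is the key vectorization trick: reshaping A to (m, 1, d) and B to (1, n, d) makes A - B an (m, n, d) array, so reducing over the last axis yields the full (m, n) distance matrix without any Python loop. A toy shape check (illustrative arrays only):

A = np.zeros((5, 2)).reshape(5, 1, 2)   # m = 5 query points
B = np.ones((7, 2)).reshape(1, 7, 2)    # n = 7 training points
print((A - B).shape)                              # -> (5, 7, 2)
print(np.sqrt(np.sum((A - B)**2, axis=2)).shape)  # -> (5, 7)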

Now, we create a kNN classifier with k=3, using Euclidean distance as the metric. The classifier is fitted on the training set and then used to predict the test set.

clf = KNNClassifier(k=3, distance='euclidean')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Finally, the classifier is evaluated using the accuracy score and the confusion matrix.

accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy*100:.2f}%')
Accuracy: 80.00%
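
For comparison, sklearn provides the same metrics out of the box; a short equivalent using sklearn.metrics (aliased so it does not clobber the confusion_matrix array built below):

from sklearn.metrics import accuracy_score, confusion_matrix as sk_confusion_matrix

print(accuracy_score(y_test, y_pred))       # same value as np.mean(y_pred == y_test)
print(sk_confusion_matrix(y_test, y_pred))  # raw (unnormalized) counts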
# confusion matrix
confusion_matrix = np.zeros((3, 3))
for i in range(len(y_test)):
    confusion_matrix[y_test[i], y_pred[i]] += 1

# normalize each row, so each cell is the fraction of that ground-truth class
confusion_matrix = confusion_matrix / np.sum(confusion_matrix, axis=1, keepdims=True)

# Confusion matrix with Plotly
# fig = px.imshow(confusion_matrix, labels=dict(x="Prediction", y="Ground truth"), x=target_names, y=target_names, color_continuous_scale='blues')
# for i in range(3):
#     for j in range(3):
#         fig.add_annotation(
#             x=j, y=i,
#             text=f'{confusion_matrix[i, j]*100:.2f}%',
#             showarrow=False,
#             font=dict(color='white' if confusion_matrix[i, j] > 0.5 or confusion_matrix[i, j] < 1e-6 else 'black')
#         )
# fig.update_layout(title_text='Confusion matrix', width=500, height=500)
# fig.show()


# Confusion matrix with Bokeh
p = figure(
    title='Confusion matrix',
    width=500,
    height=500,
    x_range=list(target_names),
    y_range=list(target_names),
)
image_plot = p.image(image=[confusion_matrix], x=0, y=0, dw=3, dh=3, palette='Blues256')
p.xaxis.major_label_orientation = 1
p.xaxis.axis_label = 'Prediction'
p.yaxis.axis_label = 'Ground truth'
p.add_tools(HoverTool(tooltips=[('accuracy', '@image')], renderers=[image_plot]))
for i in range(3):
    for j in range(3):
        p.text(
            x=[target_names[j]],
            y=[target_names[i]],
            text=[f'{confusion_matrix[i, j]*100:.2f}%'],
            text_color='black' if confusion_matrix[i, j] > 0.5 else 'white',
            text_align='center',
            text_baseline='middle',
        )
p.toolbar_location = None
show(p)

The confusion matrix indicates that the kNN classifier with k=3 and Euclidean distance reaches 80% accuracy on the test set. The setosa class is classified perfectly, while the versicolor and virginica classes are sometimes confused with each other.

We can also plot the decision boundary of the kNN classifier as a heatmap.

# First, we create a meshgrid of points to classify
x_min, x_max = X[:, 0].min() - 2, X[:, 0].max() + 2
y_min, y_max = X[:, 1].min() - 2, X[:, 1].max() + 2
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
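
The meshgrid/np.c_ pattern turns the grid into an (n_points, 2) array of query coordinates that predict can consume directly. A tiny illustration with toy ranges:

gx, gy = np.meshgrid(np.arange(0, 2), np.arange(0, 3))
print(gx.shape, gy.shape)                    # -> (3, 2) (3, 2)
print(np.c_[gx.ravel(), gy.ravel()].shape)   # -> (6, 2)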
# # ref: https://stackoverflow.com/questions/45075638/graph-k-nn-decision-boundaries-in-matplotlib

# dot_colors = ['red', 'green', 'blue']
# colors = ['red', 'green', 'blue']

# # # Put the result into a color plot
# Z = Z.reshape(xx.shape)

# # now plot with plotly
# fig = go.Figure()

# fig.add_trace(
#     go.Heatmap(
#         z=Z,
#         x=np.arange(x_min, x_max, 0.01),
#         y=np.arange(y_min, y_max, 0.01),
#         colorscale=[[0, 'pink'], [0.5, 'lightgreen'], [1, 'lightblue']],
#         showscale=False,
#     )
# )

# for target in range(len(target_names)):
#     X_sub = X[y == target]
#     fig.add_trace(
#         go.Scatter(
#             x=X_sub[:, 0],
#             y=X_sub[:, 1],
#             mode='markers',
#             name=target_names[target],
#             marker_color=colors[target],
#         )
#     )
    
# fig.update_layout(title_text='Iris dataset visualization', showlegend=True, width=800, height=800)
# fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.0, xanchor="right", x=1))
# fig.show()


from bokeh.palettes import Bright6

p = figure(title='KNN decision boundary', width=800, height=800, tools="")

# Put the regions into a color plot
Z = Z.reshape(xx.shape)
region_plot = p.image(image=[Z], x=x_min, y=y_min, dw=x_max-x_min, dh=y_max-y_min, alpha=0.4, palette=Bright6[:3])
p.add_tools(HoverTool(tooltips=[("x", "$x"), ("y", "$y"),], renderers=[region_plot]))

for target in range(len(target_names)):
    X_sub = X[y == target]
    scatter_plot = p.scatter(
        x=X_sub[:, 0],
        y=X_sub[:, 1],
        color=Bright6[target],
        legend_label=target_names[target],
    )
    p.add_tools(HoverTool(tooltips=[('x', '@x'), ('y', '@y'), ('class', f'{target_names[target]} ({target})')], renderers=[scatter_plot]))

p.x_range.range_padding = p.y_range.range_padding = 0
p.toolbar_location = None

show(p)

From the heatmap, the setosa class is clearly the most separable from the other two classes. Its decision boundary is sharp, so the kNN algorithm should be able to classify the setosa class with high accuracy.

The versicolor and virginica classes are not as separable as the setosa class. In the middle of the plot, there is a region where the two classes overlap. The kNN algorithm may not be able to classify these two classes with high accuracy.

kNN classifier with different k values#

We sweep k from 1 to 20 for both Euclidean and Manhattan distance and record the accuracy on the test set.

accuracies = {}

for d in ['euclidean', 'manhattan']:
    accuracies[d] = []
    for k in range(1, 21):
        clf = KNNClassifier(k=k, distance=d)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        accuracies[d].append(accuracy)

# # Plot the accuracy per k with plotly
# fig = go.Figure()
# for d in ['euclidean', 'manhattan']:
#     fig.add_trace(go.Scatter(x=np.arange(1, 21), y=accuracies[d], mode='lines+markers', name=d))
# fig.update_layout(title_text='Accuracy per k', xaxis_title='k', yaxis_title='Accuracy', width=800, height=500)
# fig.show()

# Plot the accuracy per k with bokeh
p = figure(title='Accuracy per k', width=800, height=500)
for i, d in enumerate(['euclidean', 'manhattan']):
    p.line(x=np.arange(1, 21), y=accuracies[d], legend_label=d, line_width=2, color=Category10_3[i])
    point_plot = p.scatter(x=np.arange(1, 21), y=accuracies[d], color=Category10_3[i], size=8)
    p.add_tools(HoverTool(tooltips=[('k', '$x'), ('accuracy', '$y')], renderers=[point_plot]))
p.xaxis.axis_label = 'k'
p.yaxis.axis_label = 'Accuracy'
p.legend.location = 'bottom_right'
p.legend.click_policy = 'hide'
p.xaxis.ticker = BasicTicker(desired_num_ticks=20)
show(p)

From the line graph, Euclidean distance with k = 10 achieves the highest accuracy of 90% on the test set, while k = {4, 15, 16, 17} yields the lowest accuracy of 75%.

For Manhattan distance, k = 3 achieves the highest accuracy of 90%, while k = 6 gives the lowest at 70%.

Using kNN Classifier from sklearn#

The default kNN classifier from sklearn uses k = 5 and Euclidean distance.
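
A minimal run with those defaults (n_neighbors=5 and p=2, i.e. the Minkowski metric reduces to Euclidean distance):

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()      # n_neighbors=5, p=2 (Euclidean) by default
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # mean accuracy on the test set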

from sklearn.neighbors import KNeighborsClassifier

accuracies = {}

# in sklearn's Minkowski metric, p=1 is Manhattan distance and p=2 is Euclidean
for p_val, d in enumerate(['manhattan', 'euclidean'], start=1):
    accuracies[d] = []
    for k in range(1, 21):
        clf = KNeighborsClassifier(n_neighbors=k, p=p_val)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        accuracies[d].append(accuracy)

# # Plot the accuracy per k with plotly
# fig = go.Figure()
# for d in ['euclidean', 'manhattan']:
#     fig.add_trace(go.Scatter(x=np.arange(1, 21), y=accuracies[d], mode='lines+markers', name=d))
# fig.update_layout(title_text='Accuracy per k', xaxis_title='k', yaxis_title='Accuracy', width=800, height=500)
# fig.show()

# Plot the accuracy per k with bokeh
p = figure(title='Accuracy per k', width=800, height=500)
for i, d in enumerate(['euclidean', 'manhattan']):
    p.line(x=np.arange(1, 21), y=accuracies[d], legend_label=d, line_width=2, color=Category10_3[i])
    point_plot = p.scatter(x=np.arange(1, 21), y=accuracies[d], color=Category10_3[i], size=8)
    p.add_tools(HoverTool(tooltips=[('k', '$x'), ('accuracy', '$y')], renderers=[point_plot]))
p.xaxis.axis_label = 'k'
p.yaxis.axis_label = 'Accuracy'
p.legend.location = 'bottom_right'
p.legend.click_policy = 'hide'
p.xaxis.ticker = BasicTicker(desired_num_ticks=20)
show(p)