\(k\)-Nearest-Neighbors#
Author: Bjarne C. Hiller
Ingredients#
- Math: Vector Norms and Distance Metrics
- NumPy: np.argsort, np.argmax, np.bincount, np.apply_along_axis
- scikit-learn
Keypoints of KNNs#
We will use the Euclidean distance, which is the length of the line segment connecting two points:
\[
d(\mathbf{x}, \mathbf{y}) = \lVert \mathbf{x} - \mathbf{y} \rVert_2 = \sqrt{\sum_{i=1}^{d} (x_i - y_i)^2}
\]
The Euclidean distance works well when all features measure similar quantities on comparable scales. Otherwise, normalize the features first, so that no single feature dominates the distance.
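For instance, a minimal sketch (assuming two hypothetical features on very different scales, say height in metres and income in euros) shows how standardization changes the distance:

import numpy as np
from sklearn.preprocessing import StandardScaler

# two hypothetical samples: (height in m, income in EUR)
X = np.array([[1.80, 30000.0],
              [1.60, 32000.0]])

# the raw distance is dominated almost entirely by the income feature
print(np.linalg.norm(X[0] - X[1]))

# after standardization, both features contribute comparably
X_std = StandardScaler().fit_transform(X)
print(np.linalg.norm(X_std[0] - X_std[1]))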
A \(k\)-NN estimator is a non-parametric model: it does not learn a fixed set of parameters during training. Instead, it stores the training data and defers all computation to prediction time, which is why this approach is also called instance-based or memory-based learning.
Implementation from Scratch#
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np
from sklearn.inspection import DecisionBoundaryDisplay
To compute all pairwise differences at once, we insert a new axis into A, so that NumPy broadcasting expands the subtraction to shape (n, m, d):
A = np.random.randn(6,2)
B = np.random.randn(4,2)
(A[:, np.newaxis, :] - B).shape
(6, 4, 2)
def distance_matrix(A, B):
    """
    Computes pairwise Euclidean distances between points.

    :param A: point array (n, d)
    :param B: point array (m, d)
    :returns: pairwise distance matrix (n, m)
    """
    # insert a new axis: (n, 1, d)
    A = A[:, np.newaxis, :]
    # NumPy broadcasting expands the subtraction to (n, m, d)
    D = A - B
    # Euclidean distance along the feature axis
    D = np.power(D, 2)
    D = np.sum(D, axis=-1)
    D = np.sqrt(D)
    return D
D = distance_matrix(A, B)
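As a quick sanity check (a sketch, assuming SciPy is available), the result should match SciPy's pairwise distance function:

from scipy.spatial.distance import cdist

# cdist computes the same pairwise Euclidean distances
np.allclose(D, cdist(A, B))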
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(["apple", "orange", "pear", "apple"])
array([0, 1, 2, 0])
# index of the single nearest point in B for each point in A
rank = np.argsort(D, axis=1)[:, :1]
# labels of the 4 points in B
y = np.array(["A", "B", "C", "D"])
y[rank]
array([['D'],
['A'],
['A'],
['D'],
['A'],
['A']], dtype='<U1')
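For \(k>1\) we will need a majority vote among the neighbor labels. A minimal sketch (toy, hypothetical label indices) of how np.bincount and np.argmax combine to find the most frequent encoded label:

# encoded labels of, say, the k = 5 nearest neighbors
neighbor_labels = np.array([2, 0, 2, 1, 2])
counts = np.bincount(neighbor_labels, minlength=3)
print(counts)             # occurrences of class 0, 1, 2
print(np.argmax(counts))  # index of the majority class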
from sklearn.base import BaseEstimator, ClassifierMixin
class KNN(BaseEstimator, ClassifierMixin):
    def __init__(self, k=1):
        self.k = k
        # sklearn convention: fields ending on underscores (_) are computed during fit
        self.X_train_ = None
        self.y_train_ = None
        self.classes_ = None
        self.le_ = None

    def fit(self, X, y):
        # store training data
        self.X_train_ = X
        self.y_train_ = y
        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        # sklearn convention: fit returns the estimator itself
        return self

    def _predict(self, X):
        """For m points, returns label counts (m, c) over c classes among the k nearest neighbors."""
        # compute pairwise distances between test and train points
        D = distance_matrix(X, self.X_train_)
        # get sorted indices
        rank = np.argsort(D, axis=1)
        # use only the top k ranks
        rank = rank[:, :self.k]
        # get labels (m, k) associated with the k closest train points
        y = self.y_train_[rank]
        # for k>1, we need to count label occurrences, so encode labels as integers
        y = np.apply_along_axis(self.le_.transform, 1, y)
        # count label occurrences along axis 1
        n = len(self.le_.classes_)
        counts = np.apply_along_axis(np.bincount, 1, y, minlength=n)
        return counts

    def predict(self, X):
        """Most frequent label among the k nearest neighbors."""
        counts = self._predict(X)
        prediction = np.argmax(counts, axis=1)
        prediction = self.le_.inverse_transform(prediction)
        return prediction

    def predict_proba(self, X):
        """Relative label frequencies among the k nearest neighbors."""
        counts = self._predict(X)
        return counts / np.sum(counts, axis=1, keepdims=True)
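A quick smoke test of the estimator (a sketch on hypothetical toy data), including the predict_proba output, which is not used in the later examples:

X_toy = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0], [1.1, 1.0]])
y_toy = np.array(["a", "a", "b", "b"])

knn = KNN(k=3)
knn.fit(X_toy, y_toy)
# two of the three nearest training points carry label "a"
print(knn.predict(np.array([[0.0, 0.1]])))
print(knn.predict_proba(np.array([[0.0, 0.1]])))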
Visualize Decision Boundaries#
For \(k=1\), the decision boundary of the \(k\)-NN classifier forms a Voronoi partition of the feature space induced by the training points:
np.random.seed(19)
n = 50
X = np.random.uniform(0, 1, size=(n,2))
y = np.random.choice([1, 2, 3], n)
fig, ax = plt.subplots()
model = KNN(k=1)
model.fit(X, y)
DecisionBoundaryDisplay.from_estimator(model, X, ax=ax)
plt.scatter(X[:,0], X[:,1], c=y, edgecolors=["k"])
plt.xlim(0,1)
plt.ylim(0,1)
plt.title("Voronoi Diagram");
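For comparison (a sketch, assuming SciPy is installed), the exact Voronoi cells of the training points can be drawn with scipy.spatial.voronoi_plot_2d; each 1-NN decision region is the union of the cells belonging to one class:

from scipy.spatial import Voronoi, voronoi_plot_2d

fig, ax = plt.subplots()
vor = Voronoi(X)
voronoi_plot_2d(vor, ax=ax, show_vertices=False)
ax.scatter(X[:, 0], X[:, 1], c=y, edgecolors="k")
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_title("Exact Voronoi cells of the training points");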
Test on Iris Dataset#
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris_ds = load_iris()
# use petal length and petal width as features
X = iris_ds["data"][:,[2,3]]
y = iris_ds["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=19)
from sklearn.metrics import accuracy_score
model = KNN(k=1)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)
accuracy_score(y_test, y_hat)
1.0
fig, ax = plt.subplots()
display = DecisionBoundaryDisplay.from_estimator(model, X_test, ax=ax)
ax.scatter(X_train[:,0], X_train[:,1], c=y_train, edgecolors="k")
plt.show()
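As a cross-check (a sketch, assuming the same train/test split), scikit-learn's own KNeighborsClassifier with n_neighbors=1 should make essentially the same predictions; ties at equal distances may be broken differently:

from sklearn.neighbors import KNeighborsClassifier

reference = KNeighborsClassifier(n_neighbors=1)
reference.fit(X_train, y_train)
# fraction of test points on which both classifiers agree
np.mean(reference.predict(X_test) == y_hat)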
Curse of Dimensionality#
In high-dimensional feature spaces, distances between points become increasingly similar, so the nearest neighbors of a query are barely closer than any other point and \(k\)-NN predictions degrade. We can observe this by adding more and more uninformative noise features to a classification problem:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
scores = []
dims = np.arange(4, 200)
for d in dims:
    X, y = make_classification(n_samples=100, n_classes=2, n_features=d, n_informative=2, n_redundant=0, n_repeated=0, random_state=19)
    # model = KNeighborsClassifier(n_neighbors=5)
    model = KNN(k=5)
    score = cross_val_score(model, X, y)
    scores.append(score)
scores = np.stack(scores)
mu = scores.mean(axis=1)
sigma = scores.std(axis=1)
# plot mean cross-validation accuracy +/- one standard deviation over the actual number of features
plt.fill_between(x=dims, y1=mu-sigma, y2=mu+sigma, alpha=0.2)
plt.plot(dims, mu)
plt.xlabel("Dimensions")
plt.ylabel("Accuracy");
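The underlying effect is distance concentration: as the number of dimensions grows, the nearest and the farthest neighbor end up almost equally far away. A small sketch (assumption: uniformly distributed random points) makes this visible with the distance_matrix function from above:

rng = np.random.default_rng(19)
for d in [2, 10, 100, 1000]:
    P = rng.uniform(0, 1, size=(100, d))
    # distances from the first point to all others
    dists = distance_matrix(P[:1], P[1:])[0]
    # this ratio approaches 1 as d grows
    print(f"d={d:5d}  nearest/farthest = {dists.min() / dists.max():.3f}")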