\(k\)-Nearest-Neighbors#

Author: Bjarne C. Hiller

Ingredients#

Key Points of \(k\)-NNs#

We will use the Euclidean distance, which is the length of the line segment connecting two points:

\[ d(\mathbf{x}, \mathbf{y}) = \lVert \mathbf{x} - \mathbf{y} \rVert_2 = \sqrt{\sum_{i=1}^{d} (x_i - y_i)^2} \]

Euclidean distance works well when all features measure similar quantities on comparable scales. Normalize the features so that each one contributes comparably to the distance.
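As a minimal sketch of that normalization step (using scikit-learn's StandardScaler, which is not otherwise used in this notebook), features can be standardized to zero mean and unit variance before distances are computed:

from sklearn.preprocessing import StandardScaler
import numpy as np

# two features on very different scales, e.g. height in meters and weight in kilograms
X = np.array([[1.80, 80.0],
              [1.65, 62.0],
              [1.75, 95.0]])

# z-score normalization: subtract the column mean, divide by the column standard deviation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# after scaling, both columns have mean 0 and standard deviation 1,
# so neither feature dominates the Euclidean distance
print(X_scaled.mean(axis=0), X_scaled.std(axis=0))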

A \(k\)-NN estimator is a non-parametric model: it does not summarize the training data with a fixed set of parameters. Instead, it is an example of instance-based or memory-based learning; the training points themselves are stored, and all computation is deferred to prediction time.

Implementation from Scratch#

from sklearn.datasets import make_classification
from sklearn.inspection import DecisionBoundaryDisplay
import matplotlib.pyplot as plt
import numpy as np

A = np.random.randn(6,2)
B = np.random.randn(4,2)

(A[:, np.newaxis, :] - B).shape
(6, 4, 2)
def distance_matrix(A, B):
    """
    Computes pairwise euclidean distances between points.

    :param A: point array (n, d)
    :param B: point array (m, d)
    :returns: pairwise distance matrix (n,m) 
    """
    # insert a new axis so that A has shape (n, 1, d)
    A = A[:, np.newaxis, :]

    # broadcasting against B (m, d) yields pairwise differences of shape (n, m, d)
    D = A - B

    # Euclidean Distance
    D = np.power(D, 2)
    D = np.sum(D, axis=-1)
    D = np.sqrt(D)
    return D
    
D = distance_matrix(A, B)
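As a quick sanity check (assuming SciPy is available; it is not used elsewhere in this notebook), the result can be compared against scipy.spatial.distance.cdist, which computes the same pairwise Euclidean distance matrix:

from scipy.spatial.distance import cdist

# both should yield the same (6, 4) matrix of pairwise distances
assert np.allclose(D, cdist(A, B))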
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit_transform(["apple", "orange", "pear", "apple"])
array([0, 1, 2, 0])
rank = np.argsort(D, axis=1)[:, :1]

y = np.array(["A", "B", "C", "D", "E", "F"])
y[rank]
array([['D'],
       ['A'],
       ['A'],
       ['D'],
       ['A'],
       ['A']], dtype='<U1')
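For \(k > 1\), we will additionally need to count how often each label occurs among the \(k\) nearest neighbors. Here is a small sketch of that counting step with np.bincount, using made-up encoded labels:

# encoded labels of the k=3 nearest neighbors for two query points (hypothetical values)
neighbor_labels = np.array([[0, 2, 0],
                            [1, 1, 2]])

# count occurrences of each of the 3 classes per row
counts = np.apply_along_axis(np.bincount, 1, neighbor_labels, minlength=3)
print(counts)                  # [[2 0 1]
                               #  [0 2 1]]
print(counts.argmax(axis=1))   # majority vote: [0 1]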
from sklearn.base import BaseEstimator, ClassifierMixin

class KNN(BaseEstimator, ClassifierMixin):
    def __init__(self, k=1):
        self.k = k
        # sklearn convention: attributes ending in an underscore (_) are computed during fit
        self.X_train_ = None
        self.y_train_ = None
        self.classes_ = None
        self.le_ = None
    
    def fit(self, X, y):
        # store the training data
        self.X_train_ = X
        self.y_train_ = y
        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        return self
 
    def _predict(self, X):
        """For m points, returns label counts (m,c) over c classes among the k nearest neighbors."""
        # compute pairwise distances between train and test
        D = distance_matrix(X, self.X_train_)

        # get sorted indices
        rank = np.argsort(D, axis=1)

        # use only top k ranks
        rank = rank[:, :self.k]

        # get labels (m, k) associated with the k closest training points
        y = self.y_train_[rank]

        # for k > 1, we need to count label occurrences, so encode the labels as integers first
        y = np.apply_along_axis(self.le_.transform, 1, y)

        # count label occurrences along axis 1
        n = len(self.le_.classes_)
        counts = np.apply_along_axis(np.bincount, 1, y, minlength=n)
        return counts
    
    def predict(self, X):
        """Most frequent label among k nearest neighbors."""
        counts = self._predict(X)
        prediction = np.argmax(counts, axis=1)
        prediction = self.le_.inverse_transform(prediction)
        return prediction
    
    def predict_proba(self, X):
        """Relative label frequencies among k nearest neighbors."""
        counts = self._predict(X)
        return counts / np.sum(counts, axis=1, keepdims=True)
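A quick usage example on a tiny, made-up toy problem (the points and labels below are purely illustrative):

# four labeled points in 2D
X_toy = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
y_toy = np.array(["red", "red", "blue", "blue"])

knn = KNN(k=3)
knn.fit(X_toy, y_toy)

# the query point lies in the "blue" cluster, so 2 of its 3 nearest neighbors are blue
print(knn.predict(np.array([[0.8, 0.9]])))        # ['blue']
print(knn.predict_proba(np.array([[0.8, 0.9]])))  # class order follows knn.classes_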

Visualize Decision Boundaries#

For \(k=1\), the decision regions of the \(k\)-NN classifier form a Voronoi partition of the feature space induced by the training points:

np.random.seed(19)

n = 50


X = np.random.uniform(0, 1, size=(n,2))
y = np.random.choice([1, 2, 3], n)

fig, ax = plt.subplots()

model = KNN(k=1)
model.fit(X, y)

DecisionBoundaryDisplay.from_estimator(model, X, ax=ax)

plt.scatter(X[:,0], X[:,1], c=y, edgecolors=["k"])
plt.xlim(0,1)
plt.ylim(0,1)
plt.title("Voronoi Diagram");
[Figure: Voronoi diagram formed by the decision regions of the 1-NN classifier on the 50 random training points.]

Test on Iris Dataset#

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris_ds = load_iris()

# use petal length and petal width as features
X = iris_ds["data"][:,[2,3]]
y = iris_ds["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=19)
from sklearn.metrics import accuracy_score

model = KNN(k=1)
model.fit(X_train, y_train)

y_hat = model.predict(X_test)

accuracy_score(y_test, y_hat)
1.0
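As a sanity check (not part of the original experiment), scikit-learn's own KNeighborsClassifier with n_neighbors=1 should give the same accuracy on this split, up to tie-breaking between equidistant training points:

from sklearn.neighbors import KNeighborsClassifier

sk_model = KNeighborsClassifier(n_neighbors=1)
sk_model.fit(X_train, y_train)

# compare against the from-scratch implementation
accuracy_score(y_test, sk_model.predict(X_test))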
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay

fig, ax = plt.subplots()

display = DecisionBoundaryDisplay.from_estimator(model, X_test, ax=ax)

ax.scatter(X_train[:,0], X_train[:,1], c=y_train, edgecolors="k")
plt.show()
[Figure: 1-NN decision boundary on the iris petal features with the training points overlaid.]

Curse of Dimensionality#

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

scores = []
for d in range(4, 200):
    X, y = make_classification(n_samples=100, n_classes=2, n_features=d, n_informative=2, n_redundant=0, n_repeated=0, random_state=19)
    # model = KNeighborsClassifier(n_neighbors=5)
    model = KNN(k=5)
    score = cross_val_score(model, X, y)
    scores.append(score)

scores = np.stack(scores)
mu = scores.mean(axis=1)
sigma = scores.std(axis=1)

dims = np.arange(4, 200)
plt.fill_between(x=dims, y1=mu-sigma, y2=mu+sigma, alpha=0.2)
plt.plot(dims, mu)
plt.xlabel("Dimensions")
plt.ylabel("Accuracy");
[Figure: Mean cross-validated accuracy of 5-NN versus number of feature dimensions, with a one-standard-deviation band.]
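The drop in accuracy illustrates the curse of dimensionality: only two features are informative, and every additional noise dimension makes the nearest neighbors less meaningful, because distances between random high-dimensional points concentrate around a common value. Here is a minimal sketch of this concentration effect (independent of the experiment above, with the dimensions chosen for illustration):

rng = np.random.default_rng(19)

for d in [2, 10, 100, 1000]:
    X = rng.uniform(0, 1, size=(100, d))
    D = distance_matrix(X, X)

    # ignore the zero diagonal when looking for the nearest neighbor
    np.fill_diagonal(D, np.inf)
    d_min = D.min(axis=1)

    # and ignore it again when looking for the farthest neighbor
    np.fill_diagonal(D, -np.inf)
    d_max = D.max(axis=1)

    # the relative contrast between nearest and farthest neighbor shrinks as d grows
    print(d, np.mean((d_max - d_min) / d_min))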
