Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

07: Multilayer Perceptrons (MLPs) and Convolutional Neural Networks (CNNs)

Towards MLPs

Today we will start by trying to solve the XOR classification problem using the linear modeling tools that we have learned about in previous sessions. We will see what is and isn’t possible with linear methods and that the multilayer perceptron (MLP) is a natural abstraction of combining linear models.

But first, have a look at the data.

import numpy as np
import matplotlib.pyplot as plt

# Fix NumPy's global RNG so everybody samples exactly the same blobs.
np.random.seed(0)

# NOTE: despite its name, `std` scales the covariance matrix below, so it is
# the per-axis *variance* of every blob.
std = 0.01
blob_cov = np.eye(2) * std
# One blob per corner of the unit square, sampled in exactly this order.
blob_centers = ([1, 1], [1, 0], [0, 0], [0, 1])
X = np.concatenate(
    [np.random.multivariate_normal(center, blob_cov, 100) for center in blob_centers]
)
# Label 1 for the blobs at (1, 1) and (0, 0), label 0 for the other two.
y = np.repeat([1, 0, 1, 0], 100)
Source
def plot_XOR_data(xs, ys, ax):
    """Scatter the two classes of a labelled 2-D point set onto ``ax``.

    Args:
        xs: Array indexed as ``xs[mask, column]``; column 0 is drawn on the
            horizontal axis and column 1 on the vertical axis.
        ys: 0/1 labels, one per row of ``xs``.
        ax: Axes-like object providing ``scatter``, ``set_xlabel``,
            ``set_ylabel`` and ``legend``.
    """
    # Use the labels that were passed in instead of the module-level ``y``,
    # so the helper is correct for any point set, not only the global data.
    is_class_1 = np.asarray(ys).astype(bool)
    is_class_0 = ~is_class_1

    ax.scatter(xs[is_class_1, 0], xs[is_class_1, 1], color="blue", label="Class 1")
    ax.scatter(xs[is_class_0, 0], xs[is_class_0, 1], color="red", label="Class 0")
    ax.set_xlabel(r"$x_1$")
    ax.set_ylabel(r"$x_2$")
    ax.legend()


def get_grid_points(grid_min, grid_max, grid_size):
    """Build a square evaluation grid over ``[grid_min, grid_max]`` in both axes.

    Returns:
        A tuple ``(points, xx, yy)``: ``points`` has one ``(x, y)`` row per grid
        node (``grid_size ** 2`` rows), ``xx`` and ``yy`` are the matching
        ``np.meshgrid`` coordinate matrices.
    """
    axis_ticks = np.linspace(grid_min, grid_max, grid_size)
    xx, yy = np.meshgrid(axis_ticks, axis_ticks)
    # Flatten both coordinate matrices row by row and pair them up column-wise.
    flat_points = np.stack((xx.reshape(-1), yy.reshape(-1)), axis=1)
    return flat_points, xx, yy


def plot_predictions_grid_regression(fig, ax, xx, yy, grid_min, grid_max, preds, levels=(0.5,)):
    """Draw continuous predictions on a grid as an image with contour lines.

    Args:
        fig: Figure-like object; its ``colorbar`` method is used.
        ax: Axes-like object; ``imshow``, ``contour`` and ``clabel`` are used.
        xx, yy: Coordinate matrices of the grid (as returned by ``np.meshgrid``).
        grid_min, grid_max: Limits of the grid, used for both image axes.
        preds: One prediction per grid node, in any shape that can be reshaped
            to the shape of ``xx``.
        levels: Prediction values at which labelled contour lines are drawn.
    """
    # Reshape to the grid's own shape so non-square grids work as well
    # (``(len(xx), len(yy))`` is only correct when the grid is square).
    preds = np.reshape(preds, np.shape(xx))

    # Row 0 of ``preds`` is the smallest y value, but imshow draws row 0 at
    # the top, hence the vertical flip.
    im = ax.imshow(preds[::-1, :], extent=(grid_min, grid_max, grid_min, grid_max))
    contour = ax.contour(xx, yy, preds, levels=levels, colors="black")
    ax.clabel(contour)

    cbar = fig.colorbar(im, ax=ax)
    cbar.set_label("predictions")


def plot_predictions_grid_classification(fig, ax, xx, yy, preds):
    """Draw class predictions on a grid as translucent filled contours.

    Args:
        fig: Figure-like object; its ``colorbar`` method is used.
        ax: Axes-like object; its ``contourf`` method is used.
        xx, yy: Coordinate matrices of the grid (as returned by ``np.meshgrid``).
        preds: One predicted class (0/1 or boolean) per grid node, in any shape
            that can be reshaped to the shape of ``xx``.
    """
    # Reshape to the grid's own shape so non-square grids work as well
    # (``(len(xx), len(yy))`` is only correct when the grid is square).
    preds = np.reshape(preds, np.shape(xx))

    # The colormap is requested by its registered name, so this helper does
    # not depend on a module-level ``plt`` being in scope.
    cf = ax.contourf(xx, yy, preds, alpha=0.4, cmap="RdBu")

    cbar = fig.colorbar(cf, ax=ax)
    cbar.set_ticks([0, 1])
    cbar.set_label("predictions")


# Overview of the raw data set: a single square panel.
fig, ax = plt.subplots(figsize=(5, 5))
plot_XOR_data(X, y, ax)
ax.set_title("XOR data")
plt.show()

We have seen the XOR problem in the lecture, together with the famous result that this simple problem cannot be solved by a linear model alone.

Instead of using just one linear model, let’s try to solve it by chaining linear models. For starters, we have the following two linear models, that each correctly classify one blob of class 1 and class 0.

$$\hat{y}_1 = 1 + (-1) \cdot x_1 + (-1) \cdot x_2 = 1 - x_1 - x_2$$
$$\hat{y}_2 = -1 + 1 \cdot x_1 + 1 \cdot x_2 = -1 + x_1 + x_2$$
Source
from sklearn.linear_model import LinearRegression


def get_linear_models_class_1():
    """Return two ``LinearRegression`` objects with hand-set parameters.

    Neither model is fitted. The first computes ``1 - x1 - x2``, the second
    ``-1 + x1 + x2``.

    Returns:
        Tuple ``(lr1, lr2)``.
    """
    fixed_models = []
    for intercept, weights in ((1, [-1, -1]), (-1, [1, 1])):
        regressor = LinearRegression()
        regressor.intercept_ = intercept
        regressor.coef_ = np.array(weights)
        fixed_models.append(regressor)
    return tuple(fixed_models)


def get_linear_models_binary():
    """Return two ``LinearRegression`` objects that each pass one input through.

    Neither model is fitted. With zero intercept and one unit weight each, the
    first model outputs ``x2`` and the second outputs ``x1``.

    Returns:
        Tuple ``(lr1, lr2)``.
    """
    fixed_models = []
    for intercept, weights in ((0, [0, 1]), (0, [1, 0])):
        regressor = LinearRegression()
        regressor.intercept_ = intercept
        regressor.coef_ = np.array(weights)
        fixed_models.append(regressor)
    return tuple(fixed_models)
lr1, lr2 = get_linear_models_class_1()
# lr1, lr2 = get_linear_models_binary()  # in case you want to try something else

# Report each model as [intercept, weight of x1, weight of x2].
print(
    "Parameters of the first model:",
    f"beta^T = [{lr1.intercept_:.3f}, {lr1.coef_[0]:.3f}, {lr1.coef_[1]:.3f}]",
)
print(
    "Parameters of the second model:",
    f"beta^T = [{lr2.intercept_:.3f}, {lr2.coef_[0]:.3f}, {lr2.coef_[1]:.3f}]",
)
Source
# Evaluate both fixed models on a dense grid covering the plotted region.
plot_min = -0.5
plot_max = 1.5
grid_points, xx, yy = get_grid_points(plot_min, plot_max, 100)

lr1_preds_grid = lr1.predict(grid_points)
lr2_preds_grid = lr2.predict(grid_points)


# One panel per model: prediction surface with the data points on top.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for panel, grid_preds, panel_title in (
    (ax1, lr1_preds_grid, "Linear model 1"),
    (ax2, lr2_preds_grid, "Linear model 2"),
):
    plot_predictions_grid_regression(fig, panel, xx, yy, plot_min, plot_max, grid_preds)
    plot_XOR_data(X, y, panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

These two linear models both solve a subproblem of the overall problem, and it would be awesome if we could just combine them to form the overall solution.

The idea is simple: When predictions from model 1 are large, then it’s probably class 1, and when predictions from model 2 are large, then it’s probably class 1 as well (each model recognizes a different blob of class 1). So let’s try to train another linear model on the outputs of model 1 and 2 to solve the XOR problem.

# First layer: the outputs of the two fixed models become the features of model 3.
lr1_preds = lr1.predict(X)
lr2_preds = lr2.predict(X)
X_preds = np.column_stack((lr1_preds, lr2_preds))  # one row per sample, one column per first-layer model

# Second layer: least-squares fit on those two features.
lr3 = LinearRegression()
lr3.fit(X_preds, y)

# Report the fitted second layer as [intercept, weight of model 1, weight of model 2].
print("Parameters of the second layer:", f"beta^T = [{lr3.intercept_:.3f}, {lr3.coef_[0]:.3f}, {lr3.coef_[1]:.3f}]")
Source
# Push the grid through both layers: first-layer grid outputs feed model 3.
grid_features = np.column_stack((lr1_preds_grid, lr2_preds_grid))
lr3_preds_grid = lr3.predict(grid_features)

fig, ax1 = plt.subplots(figsize=(6, 5))

plot_predictions_grid_regression(fig, ax1, xx, yy, plot_min, plot_max, lr3_preds_grid)
plot_XOR_data(X, y, ax1)
ax1.set_title("Combined linear model")

plt.tight_layout()
plt.show()

Hm, this doesn’t look right at all.

To find out why this fails, we will have to look at some equations. Firstly we formalize our setup as:

$$\mathbf{a}^{(0)} = \begin{bmatrix} x_1 \\ x_2 \end{bmatrix}, \qquad b^{(1)}_i = \hat{\beta}_{0, i}, \qquad W^{(1)}_i = \begin{bmatrix} \hat{\beta}_{1, i} & \hat{\beta}_{2, i} \end{bmatrix}, \qquad i \in \{1, 2\}$$
$$a^{(1)}_1 = W^{(1)}_1 \mathbf{a}^{(0)} + b^{(1)}_1, \qquad a^{(1)}_2 = W^{(1)}_2 \mathbf{a}^{(0)} + b^{(1)}_2$$
$$W^{(1)} = \begin{bmatrix} W^{(1)}_1 \\ W^{(1)}_2 \end{bmatrix}, \qquad \mathbf{b}^{(1)} = \begin{bmatrix} b^{(1)}_1 \\ b^{(1)}_2 \end{bmatrix}$$
$$\mathbf{a}^{(1)} = W^{(1)} \mathbf{a}^{(0)} + \mathbf{b}^{(1)} = \begin{bmatrix} a^{(1)}_1 \\ a^{(1)}_2 \end{bmatrix}$$
$$\begin{aligned} \mathbf{a}^{(2)} &= W^{(2)} \mathbf{a}^{(1)} + \mathbf{b}^{(2)} \\ &= W^{(2)} \left( W^{(1)} \mathbf{a}^{(0)} + \mathbf{b}^{(1)} \right) + \mathbf{b}^{(2)} \\ &= \left( W^{(2)} W^{(1)} \right) \mathbf{a}^{(0)} + \left( W^{(2)} \mathbf{b}^{(1)} + \mathbf{b}^{(2)} \right) \end{aligned}$$

As we can see, stacking linear models doesn’t really work out as we wished. The result is again just a linear function! (But you already knew that from the lecture.)

Let’s go back to our two linear models from the start. They are supposed to classify the points according to the XOR function, so maybe we should treat the classification step as a part of them.

The following plots reflect this point of view, by showing the final predicted classes from each of these models.

def threshold(x,  t=0.5):
    """Return whether ``x`` lies strictly above the cut-off ``t``.

    For array inputs the comparison is element-wise.
    """
    return t < x

# Hard 0/1 decisions of both first-layer models on the plotting grid.
lr1_preds_grid_classified, lr2_preds_grid_classified = (
    threshold(fixed_model.predict(grid_points)) for fixed_model in (lr1, lr2)
)
Source
# One panel per thresholded model: predicted class regions with the data on top.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for panel, grid_classes, panel_title in (
    (ax1, lr1_preds_grid_classified, "Linear Classifier 1"),
    (ax2, lr2_preds_grid_classified, "Linear Classifier 2"),
):
    plot_predictions_grid_classification(fig, panel, xx, yy, grid_classes)
    plot_XOR_data(X, y, panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

These class outputs are not the result of a purely linear function anymore, as they include the thresholding step. So the argument from before, that chaining another model after these gives us still just a linear function overall, does not hold anymore.

Let’s try again to combine these predictions using the same kind of model.

# First layer, now including the thresholding step: boolean features per sample.
lr1_preds_classified = threshold(lr1.predict(X))
lr2_preds_classified = threshold(lr2.predict(X))
X_preds_classified = np.column_stack((lr1_preds_classified, lr2_preds_classified))

# Second layer: least-squares fit on the thresholded first-layer outputs.
lr3_classified = LinearRegression()
lr3_classified.fit(X_preds_classified, y)

# Report the fitted second layer as [intercept, weight of model 1, weight of model 2].
print("Parameters of the second layer:", f"beta^T = [{lr3_classified.intercept_:.3f}, {lr3_classified.coef_[0]:.3f}, {lr3_classified.coef_[1]:.3f}]")
Source
# Push the grid through both layers, thresholding after each of them.
grid_features_classified = np.column_stack((lr1_preds_grid_classified, lr2_preds_grid_classified))
lr3_preds_grid_classified = threshold(lr3_classified.predict(grid_features_classified))

fig, ax1 = plt.subplots(figsize=(6, 5))

plot_predictions_grid_classification(fig, ax1, xx, yy, lr3_preds_grid_classified)
plot_XOR_data(X, y, ax1)
ax1.set_title("Combined linear classifier")

plt.tight_layout()
plt.show()

It worked!

Just one problem: We didn’t train the whole thing. During training, we would have to somehow make sure, that the first two linear models provide sensible features for the third model to use.

We know from the lecture, that MLPs are more complex than linear models and that they can be trained as a whole. So let’s have a look at how to interpret our current setup as an MLP.

We can see that our combined linear classifier model actually matches the structure of an MLP, so we have just constructed our first MLP model.

The lecture said that to train MLPs, usually gradient descent is used. Let’s have a look at what that means and how this affects our model.

Source
def plot_activation(f, f_label, title, start, end, samples, ax):
    """Plot the activation function ``f`` over the interval ``[start, end]``.

    Args:
        f: Callable applied to a 1-D array of inputs.
        f_label: Legend entry for the curve.
        title: Title of the panel.
        start, end: Interval limits (both included).
        samples: Number of evenly spaced evaluation points.
        ax: Axes-like object to draw on.
    """
    inputs = np.linspace(start, end, samples)
    activations = f(inputs)

    ax.plot(inputs, activations, label=f_label)
    ax.set_title(title)
    # The horizontal axis is a layer's pre-activation, the vertical axis its output.
    ax.set_xlabel(r"$W^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)}$")
    ax.set_ylabel(r"$a^{(l)}$")
    ax.legend()


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


# Compare the hard threshold (left) with its smooth counterpart, the sigmoid (right).
fig = plt.figure(figsize=(12, 5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

plot_activation(threshold, r"$W^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)} > 0.5$", "Threshold function as activation function", 0, 1, 1000, ax1)
plot_activation(sigmoid, r"$\text{sigmoid}(W^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)})$", "Sigmoid function as activation function", -10, 10, 1000, ax2)

plt.tight_layout()
# Display the figure like every other plotting cell does. The previous
# ``plt.plot()`` draws an empty line series and does not show anything.
plt.show()

The next cells use logistic regression models to implement the idea of replacing the threshold function with sigmoid in our combined model / MLP.

Source
from sklearn.linear_model import LogisticRegression


def get_logistic_models_class_1(xs, ys):
    """Return two ``LogisticRegression`` models with hand-set parameters.

    Each model is first fitted on ``(xs, ys)`` and then has its parameters
    overwritten (presumably the fit is only there so that scikit-learn treats
    the estimator as fitted, e.g. has ``classes_`` set - confirm). The first
    model has the linear part ``0.5 - x1 - x2``, the second ``-1.5 + x1 + x2``.

    Args:
        xs: Feature matrix passed to ``fit``.
        ys: Binary labels passed to ``fit``.

    Returns:
        Tuple ``(logr1, logr2)``.
    """
    hand_set_models = []
    for intercept, weights in ((0.5, [-1.0, -1.0]), (-1.5, [1.0, 1.0])):
        classifier = LogisticRegression().fit(xs, ys)
        # Keep the shapes scikit-learn documents for a binary problem:
        # intercept_ is (1,) and coef_ is (1, n_features). A scalar intercept
        # and a flat coef_ only work by accident of the installed version.
        classifier.intercept_ = np.array([intercept])
        classifier.coef_ = np.array([weights])
        hand_set_models.append(classifier)
    return tuple(hand_set_models)


def get_logistic_models_binary(xs, ys):
    """Return two ``LogisticRegression`` models that each look at one input.

    Each model is first fitted on ``(xs, ys)`` and then has its parameters
    overwritten (presumably the fit is only there so that scikit-learn treats
    the estimator as fitted, e.g. has ``classes_`` set - confirm). The first
    model has the linear part ``-0.5 + x2``, the second ``-0.5 + x1``.

    Args:
        xs: Feature matrix passed to ``fit``.
        ys: Binary labels passed to ``fit``.

    Returns:
        Tuple ``(logr1, logr2)``.
    """
    hand_set_models = []
    for intercept, weights in ((-0.5, [0.0, 1.0]), (-0.5, [1.0, 0.0])):
        classifier = LogisticRegression().fit(xs, ys)
        # Keep the shapes scikit-learn documents for a binary problem:
        # intercept_ is (1,) and coef_ is (1, n_features). A scalar intercept
        # and a flat coef_ only work by accident of the installed version.
        classifier.intercept_ = np.array([intercept])
        classifier.coef_ = np.array([weights])
        hand_set_models.append(classifier)
    return tuple(hand_set_models)
# Logistic counterparts of linear models 1 and 2: same weights, intercepts lowered by 0.5.
logr1, logr2 = get_logistic_models_class_1(X, y)
#logr1, logr2 = get_logistic_models_binary(X, y)

# predict_proba returns one column per class; column 1 is the smooth (not yet
# thresholded) score for class 1.
logr1_preds_grid, logr2_preds_grid = (
    classifier.predict_proba(grid_points)[:, 1] for classifier in (logr1, logr2)
)
Source
# One panel per logistic model: class-1 probability surface with the data on top.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for panel, grid_probs, panel_title in (
    (ax1, logr1_preds_grid, "Logistic regression model 1"),
    (ax2, logr2_preds_grid, "Logistic regression model 2"),
):
    plot_predictions_grid_regression(fig, panel, xx, yy, plot_min, plot_max, grid_probs)
    plot_XOR_data(X, y, panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()
# First layer: class-1 probabilities of both hand-set logistic models.
logr1_preds = logr1.predict_proba(X)[:, 1]
logr2_preds = logr2.predict_proba(X)[:, 1]

X_preds_log = np.column_stack((logr1_preds, logr2_preds))

# Second layer: a logistic regression fitted on those probabilities.
logr3 = LogisticRegression()
logr3.fit(X_preds_log, y)
print("Parameters of the second layer:", f"beta^T = [{logr3.intercept_[0]:.3f}, {logr3.coef_[0, 0]:.3f}, {logr3.coef_[0, 1]:.3f}]")

# for zero in-sample error
# logr3.intercept_ = np.array([-1.55])
# logr3.coef_ = np.array([[2, 2]])
Source
# Push the grid through both layers and keep the class-1 probability.
grid_features_log = np.column_stack((logr1_preds_grid, logr2_preds_grid))
logr3_preds_grid = logr3.predict_proba(grid_features_log)[:, 1]

fig, ax1 = plt.subplots(figsize=(6, 5))

plot_predictions_grid_regression(fig, ax1, xx, yy, plot_min, plot_max, logr3_preds_grid)
plot_XOR_data(X, y, ax1)
ax1.set_title("Combined logistic regression model")

plt.tight_layout()
plt.show()

We can observe that the result is really similar to the combined linear classifier, but the predictions are now smooth. We still have to apply the threshold function at the end to obtain the final classes, but now we have sigmoid inside the model as a smooth alternative for the thresholding.

This model architecture is now ready to be trained with gradient descent, but for that we will switch libraries. PyTorch (by Meta, as in Facebook) and the alternative TensorFlow (by Google) are the de facto standard libraries for neural networks in Python, which includes MLPs. This will also allow us to go further and implement a CNN next. We will use PyTorch.

Have a look around the code cells for this example. They come with explainers of what they are about on a high level.

1. Imports and conversion

torch brings general stuff like the Tensor object through the constructor torch.tensor and torch.no_grad, which simply disables PyTorch’s automatic gradient calculation and therefore saves resources. You can think of a Tensor in PyTorch as a multidimensional array, like numpy arrays (numpy.ndarray). Unlike numpy arrays, PyTorch Tensors can be easily moved to a GPU to accelerate operations on them, if one is available. We convert our numpy arrays X and y to Tensors here. Also we use torch.manual_seed to predetermine the sequence of random numbers that are internally generated by PyTorch, wherever it uses randomness. This way every one of us gets exactly the same results. (Watch out that in practice, there can be sources of randomness in other libraries (such as numpy) that this doesn’t control and that the random number generators in different versions of the same library (or dependencies) might yield different sequences of random numbers for the same seed.)

torch.nn has a lot of functionality that is needed when working with neural networks. We will use torch.nn.Linear and torch.nn.Sigmoid as the building blocks for our MLP. We combine them into one complete model by defining a new class XOR_MLP, that inherits from torch.nn.Module. torch.nn.MSELoss supplies us with the optimization target we are used to.

torch.optim contains implementations of various optimization algorithms. We use torch.optim.SGD (for stochastic gradient descent) here, which was presented in the lecture.

tqdm enables us to show a progress bar for the training loop.

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.auto import tqdm

# Fix PyTorch's RNG so that runs are repeatable.
torch.manual_seed(0)

# Copy the NumPy data into tensors; the labels become a float64 column vector.
X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y, dtype=torch.float64).reshape(-1, 1)

for tensor_name, tensor in (("X Tensor:", X_tensor), ("y Tensor:", y_tensor)):
    print(tensor_name, tensor.shape, tensor.dtype)

2. Model definition

Here we define our MLP as a new class XOR_MLP. In __init__ we define which building blocks our model should consist of and in forward we define how they should be wired up to form our model function.

class XOR_MLP(nn.Module):
    """Small MLP: 2 inputs -> 2 sigmoid hidden units -> 1 sigmoid output.

    All parameters are stored as ``torch.float64``.
    """

    def __init__(self):
        super().__init__()
        # Hidden layer: plays the role of the two first-stage models (feature extraction).
        self.hidden = nn.Linear(in_features=2, out_features=2)
        # Output layer: plays the role of the third model combining the two hidden outputs.
        self.output = nn.Linear(in_features=2, out_features=1)
        # Shared activation function, as in logistic regression.
        self.sigmoid = nn.Sigmoid()

        # Switch all parameters to float64 so they match the float64 inputs.
        self.double()

    def forward(self, x):
        """Apply both layers in sequence, each followed by the sigmoid."""
        activations = x
        for layer in (self.hidden, self.output):
            activations = self.sigmoid(layer(activations))
        return activations

3. Training

Here we first initialize objects for the model, the loss function and the optimization algorithm. Then we use them in a training loop that implements mini-batch gradient descent. Note that in every epoch we first shuffle the data and then go through it in mini-batches, updating the model parameters once per mini-batch based on only batch_size many points.

Updating the parameters consists of only a couple calls to PyTorch functions, which are very similar to the pseudocode from the lecture. The loss is first computed based on model predictions, then the gradients are computed and the gradient descent update rule $\theta \leftarrow \theta - \eta \nabla_{\theta} L_B$ is applied, where $\eta$ is the learning rate, which we supply to the SGD optimizer as the argument lr during instantiation.

The call to optimizer.zero_grad is a pure technicality here, since PyTorch by default accumulates a buffer of past gradients, which we don’t need and therefore clear using this function.

# Set up the model, the loss function and the optimizer.
model = XOR_MLP()
criterion = nn.MSELoss()  # mean squared error between the sigmoid outputs and the 0/1 targets
optimizer = optim.SGD(model.parameters(), lr=0.5)  # SGD with learning rate 0.5

# Mini-batch gradient descent
num_epochs = 200
batch_size = 25
num_samples = len(X_tensor)
for epoch in tqdm(range(num_epochs)):
    # Draw a fresh random sample order for this epoch
    indices = torch.randperm(num_samples)
    X_shuffled, y_shuffled = X_tensor[indices], y_tensor[indices]

    # Walk through the shuffled data one mini-batch at a time
    for start in range(0, num_samples, batch_size):
        batch = slice(start, start + batch_size)

        # Forward pass
        outputs = model(X_shuffled[batch])
        loss = criterion(outputs, y_shuffled[batch])

        # Backward pass: clear old gradients, compute new ones, update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

4. Visualization of predictions

grid_points_tensor = torch.tensor(grid_points)

# Inference only: evaluation mode and no gradient tracking.
model.eval()
with torch.no_grad():
    predictions = model(grid_points_tensor)
    predicted_classes = predictions.gt(0.5)


# Left: raw network outputs. Right: the same outputs after thresholding at 0.5.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

plot_predictions_grid_regression(fig, ax1, xx, yy, plot_min, plot_max, predictions.numpy())
plot_predictions_grid_classification(fig, ax2, xx, yy, predicted_classes.numpy())

for panel, panel_title in ((ax1, "MLP predictions"), (ax2, "Thresholded MLP predictions")):
    plot_XOR_data(X, y, panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

There is an interactive version of this model set up >here<. Go over there and click start to see how the model changes during training. Feel free to play around a bit by adding layers or neurons, changing the activation function or do whatever you like with the model/data configuration.

Hands-on MNIST: MLP and CNN

Now that we have seen how to build an MLP in PyTorch, let’s step it up a notch. We will take the MNIST dataset as an example image dataset. It consists of 28x28 pixel images of handwritten digits 0-9. You will create and train your own fully connected MLP and CNN to recognize the handwritten digits in the data.

Let’s first have a look at some example images from this dataset.

Source
# Function to plot one example image per class
def plot_random_images_per_class(model, loader, device):
    """Show one example image for each of the digit classes 0-9.

    For every class, the first matching image met while iterating ``loader``
    is used, so which image is shown depends on the loader's ordering. If
    ``model`` is given, its prediction is shown next to the true label (blue
    title if they agree, red otherwise).

    Args:
        model: Classifier called as ``model(batch)`` whose output has one score
            per class along dimension 1, or ``None`` to show labels only.
        loader: Iterable of ``(data, target)`` batches. Images are expected in
            channel-first layout (they are transposed CHW -> HWC for plotting).
        device: Device the model input is moved to.

    Raises:
        KeyError: If one of the classes 0-9 never occurs in ``loader``.
    """
    num_classes = 10

    if model is not None:
        model.eval()

    # class label -> image tensor exactly as the loader produced it
    class_images = {}
    for data, target in loader:
        for image, label in zip(data, target.tolist()):
            class_images.setdefault(label, image)
        # Stop as soon as every class has an example instead of scanning
        # the remainder of the data set.
        if all(label in class_images for label in range(num_classes)):
            break

    missing = [label for label in range(num_classes) if label not in class_images]
    if missing:
        raise KeyError(f"no example found for class(es) {missing}")

    # Plot the images
    fig, axes = plt.subplots(1, num_classes, figsize=(15, 3))
    axes = axes.ravel()

    with torch.no_grad():
        for true_label in range(num_classes):
            image = class_images[true_label]

            # Display copy: CHW -> HWC, rescaled to the [0, 1] range.
            img = np.transpose(image.cpu().numpy(), (1, 2, 0))
            value_range = img.max() - img.min()
            if value_range > 0:
                img = (img - img.min()) / value_range
            else:
                # A constant image would otherwise cause a division by zero.
                img = np.zeros_like(img)

            if model is not None:
                # Predict on the image exactly as the loader delivered it. The
                # rescaled display copy has different input statistics than
                # the batches the model normally receives from the loader.
                output = model(image.unsqueeze(0).to(device))
                _, predicted = torch.max(output, 1)
                pred_label = predicted.item()
                axes[true_label].set_title(f'True: {true_label}, Pred: {pred_label}',
                                           fontsize=10, color='blue' if true_label == pred_label else 'red')
            else:
                axes[true_label].set_title(f'Label: {true_label}')

            # Plot the image
            axes[true_label].imshow(img, cmap='gray')
            axes[true_label].axis('off')

    plt.suptitle('Random Image from Each MNIST Class', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device = {device}")

transform = transforms.Compose([
    transforms.ToTensor(),  # Converts PIL image to a float tensor in the 0-1 range; MNIST images are grayscale, i.e. 1 channel
    # Standardize the data (mean 0, std 1) with the commonly used statistics
    # of the MNIST training images: mean 0.1307, std 0.3081. One value per
    # channel, hence the 1-tuples.
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Load training and test data (downloaded to ./data on first use)
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Only the training data is reshuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# plot
plot_random_images_per_class(None, train_loader, device)

As you can see, the images are in grayscale, so there is no color information.

MLP

If you struggle with MLPs, there are really good videos by 3Blue1Brown >here< (on MLPs / neural networks) and >here< (on gradient descent). In case you want to watch them, please do so before or after the exercise session.

Source
from IPython.display import clear_output


def get_loss(model, criterion, loader):
    """Return the per-sample average of ``criterion`` over all of ``loader``.

    The model is switched to evaluation mode and gradients are not tracked.
    Batches are moved to the module-level ``device``. Each batch loss is
    weighted by its batch size (assumes ``criterion`` returns the mean over
    the batch - confirm for non-default reductions).
    """
    model.eval()
    weighted_loss_sum = 0.0
    samples_seen = 0

    with torch.no_grad():
        for inputs, labels in loader:
            batch_len = inputs.size(0)
            batch_loss = criterion(model(inputs.to(device)), labels.to(device))
            weighted_loss_sum += batch_loss.item() * batch_len
            samples_seen += batch_len

    return weighted_loss_sum / samples_seen


def show_next_plot(epochs, epoch_numbers, epoch_train_losses, epoch_test_losses):
    """Draw the train and test loss curves recorded so far in a new figure.

    Args:
        epochs: Total number of planned epochs; fixes the x-axis range.
        epoch_numbers: Epoch indices for which losses have been recorded.
        epoch_train_losses: Train loss per recorded epoch.
        epoch_test_losses: Test loss per recorded epoch.
    """
    fig, ax1 = plt.subplots(figsize=(6, 5))
    ax1.clear()

    curves = (
        (epoch_train_losses, 'b-o', 'Train Loss'),
        (epoch_test_losses, 'r-o', 'Test Loss'),
    )
    for losses, line_format, curve_label in curves:
        ax1.plot(epoch_numbers, losses, line_format, label=curve_label, markersize=4)

    ax1.set_title('Loss over time')
    ax1.set_xlabel('Epoch Number')
    ax1.set_ylabel('Loss (Avg)')

    # The x-axis always spans the whole planned run, so the curves grow into it.
    ax1.set_xticks(range(epochs+1))
    ax1.set_xlim(-0.5, epochs+0.5)
    # Leave 10% headroom above the largest loss seen so far.
    highest_loss = np.max([epoch_train_losses, epoch_test_losses])
    ax1.set_ylim(0, highest_loss*1.1)
    ax1.grid(True, alpha=0.3)
    ax1.legend()

    # Update the figure
    plt.tight_layout()
    plt.show()


def train_live_loss(model, criterion, optimizer, epochs=10):
    """Train ``model`` and redraw a train/test loss plot after every epoch.

    Uses the module-level ``train_loader``, ``test_loader``, ``train_dataset``
    and ``device``. The losses at epoch 0 are measured before any update. The
    train loss of later epochs is the running average collected during that
    epoch, i.e. while the parameters are still changing.

    Args:
        model: Module to train; updated in place.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that holds the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.
    """
    print("Starting Training...")

    # Set up live plotting
    plt.ion()  # Turn on interactive mode
    try:
        # Initialize lists for tracking
        epoch_numbers = [0]
        epoch_train_losses = [get_loss(model, criterion, train_loader)]
        epoch_test_losses = [get_loss(model, criterion, test_loader)]

        show_next_plot(epochs, epoch_numbers, epoch_train_losses, epoch_test_losses)

        # Training loop
        for epoch in range(epochs):
            running_train_loss = 0.0
            model.train()

            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Weight the batch loss by the batch size for a per-sample average
                running_train_loss += loss.item() * inputs.size(0)

            # Save current epoch loss
            epoch_numbers.append(epoch + 1)
            epoch_train_losses.append(running_train_loss / len(train_dataset))
            epoch_test_losses.append(get_loss(model, criterion, test_loader))

            # Clear and redraw the plot
            clear_output(wait=True)
            show_next_plot(epochs, epoch_numbers, epoch_train_losses, epoch_test_losses)
    finally:
        # Disable interactive mode, also when training is interrupted or fails
        plt.ioff()
class ImageMLP(nn.Module):
    """Exercise skeleton for the MNIST MLP (Task 2).

    As given, the module registers no layers and ``forward`` returns its
    input unchanged; both methods are meant to be completed.
    """
    def __init__(self):
        super().__init__()

        # TODO: Task 2
    
    def forward(self, x):
        """Placeholder forward pass: currently the identity."""
        # TODO: Task 2
        
        return x
# Initialize model
# NOTE(review): ImageMLP is an exercise skeleton in this file; this cell presumably only works once Task 2 is completed - confirm.
imagemlp_model = ImageMLP().to(device)  # send model weights to the GPU, if available

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # a popular choice for multi-class classification
optimizer = optim.SGD(imagemlp_model.parameters(), lr=0.001)  # plain SGD over the model's parameters, learning rate 0.001

# Start training
train_live_loss(imagemlp_model, criterion, optimizer, epochs=10)  # 10 epochs, loss plot redrawn after each one
# Show one test image per digit class, titled with the true label and the model's prediction
plot_random_images_per_class(imagemlp_model, test_loader, device)
Source
def get_model_predictions(model, test_loader, device):
    """Collect true and predicted labels for every sample in ``test_loader``.

    The predicted label of a sample is the index of the largest model output
    along dimension 1. A progress bar is shown while iterating.

    Returns:
        Tuple ``(targets, predictions)`` of NumPy arrays, in loader order.
    """
    model.eval()
    targets_seen, preds_seen = [], []

    with torch.no_grad():
        for data, target in tqdm(test_loader):
            scores = model(data.to(device))
            predicted = torch.max(scores, 1).indices
            preds_seen.extend(predicted.cpu().numpy())
            targets_seen.extend(target.cpu().numpy())

    return np.array(targets_seen), np.array(preds_seen)
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

def evaluate(model, loader):
    """Plot a confusion matrix and print a classification report for ``model``.

    Args:
        model: Classifier to evaluate (moved to / expected on the module-level
            ``device``).
        loader: Iterable of ``(data, target)`` batches to evaluate on.
    """
    print("Computing predictions...")
    # Evaluate on the loader that was passed in, not on the global test_loader.
    true_labels, predicted_labels = get_model_predictions(model, loader, device)

    # Create confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=range(10), yticklabels=range(10))
    # Name the evaluated model in the title; a fixed "MLP" title was wrong
    # whenever this function was used for another model such as the CNN.
    plt.title(f'Confusion Matrix for MNIST {type(model).__name__}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, digits=4))


# Confusion matrix and classification report for the trained MLP
evaluate(imagemlp_model, test_loader)

CNN

There is an animated and interactive CNN visualization >here<, which can help you see how the core operations involved in CNNs operate on data. Make sure to scroll down and check out the section “Understanding Hyperparameters”, if you have a hard time understanding kernel size, stride or padding in convolutions or pooling layers.

There is also a great video by 3Blue1Brown >here< about convolution, with a lot more visualizations and a really good explanation. In case you want to watch it, please do so before or after the exercise session.

class CNN(nn.Module):
    """Exercise skeleton for the MNIST CNN (Task 1).

    As given, the module registers no layers and ``forward`` returns its
    input unchanged; both methods are meant to be completed.
    """
    def __init__(self):
        super().__init__()

        # TODO: Task 1
    
    def forward(self, x):
        """Placeholder forward pass: currently the identity."""
        # TODO: Task 1
        
        return x
# Initialize model
# NOTE(review): CNN is an exercise skeleton in this file; this cell presumably only works once Task 1 is completed - confirm.
cnn_model = CNN().to(device)  # send model weights to the GPU, if available

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # a popular choice for multi-class classification
optimizer = optim.SGD(cnn_model.parameters(), lr=0.001)  # plain SGD over the model's parameters, learning rate 0.001

# Start training
train_live_loss(cnn_model, criterion, optimizer, epochs=10)  # 10 epochs, loss plot redrawn after each one
# Show one test image per digit class, titled with the true label and the model's prediction
plot_random_images_per_class(cnn_model, test_loader, device)
# Confusion matrix and classification report for the trained CNN
evaluate(cnn_model, test_loader)