Stage 03 — Neural Networks: Solutions

Worked solutions for Stage 3.

Dependencies: torch, torchvision, numpy, matplotlib.

Manual backprop through 2-layer MLP

For a 2-layer MLP y = W₂ relu(W₁ x), derive ∂L/∂W₁ for L = (y − target)².

Let:

  • z₁ = W₁ x (shape: hidden)
  • h₁ = relu(z₁)
  • y = W₂ h₁ (scalar for this exercise)
  • L = (y − t)²

Chain rule:

∂L/∂y     = 2(y − t)
∂y/∂h₁    = W₂                            # row vector
∂h₁/∂z₁   = diag(1[z₁ > 0])               # element-wise
∂z₁/∂W₁   = x  (with appropriate shapes)

δ_z₁ = ∂L/∂z₁ = 2(y − t) · W₂ᵀ ⊙ 1[z₁ > 0]
∂L/∂W₁ = δ_z₁ ⊗ x = δ_z₁ xᵀ               # outer product, shape (hidden, in)

In code, where dL_dz1 plays the role of δ_z₁:

import torch

torch.manual_seed(0)
x = torch.randn(3)                         # (3,)
W1 = torch.randn(4, 3, requires_grad=True) # (hidden=4, in=3)
W2 = torch.randn(4, requires_grad=True)    # (4,) for scalar output
target = torch.tensor(2.0)

# forward
z1 = W1 @ x
h1 = torch.relu(z1)
y = W2 @ h1
loss = (y - target) ** 2
loss.backward()

# manual
dL_dy = 2 * (y - target)              # scalar
dL_dh1 = W2 * dL_dy                   # (4,)
dL_dz1 = dL_dh1 * (z1 > 0).float()    # (4,)
dL_dW1_manual = torch.outer(dL_dz1, x)   # (4, 3)

print("torch:", W1.grad)
print("manual:", dL_dW1_manual)
print("close?", torch.allclose(W1.grad, dL_dW1_manual.detach()))

ReLU’s “derivative” at 0 is conventionally 0 in PyTorch; (z1 > 0) matches.
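
A one-line experiment confirms the convention:

z = torch.zeros(1, requires_grad=True)
torch.relu(z).sum().backward()
print(z.grad)                             # tensor([0.]): ReLU's gradient at exactly 0 is 0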

MNIST MLP from raw PyTorch

Train an MLP on MNIST in raw PyTorch (no nn.Sequential). Hit >97% test accuracy.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

device = "cuda" if torch.cuda.is_available() else "cpu"

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

tfm = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_ds = datasets.MNIST(".", train=True, download=True, transform=tfm)
test_ds = datasets.MNIST(".", train=False, download=True, transform=tfm)
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=512)

model = MLP().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(5):
    model.train()
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        loss = F.cross_entropy(model(x), y)
        opt.zero_grad(); loss.backward(); opt.step()

    # eval
    model.eval()
    correct = 0; total = 0
    with torch.no_grad():
        for x, y in test_dl:
            x, y = x.to(device), y.to(device)
            pred = model(x).argmax(dim=1)
            correct += (pred == y).sum().item()
            total += y.size(0)
    print(f"epoch {epoch+1} test acc {correct/total:.4f}")

You should see 97–98% test accuracy by epoch 5. Add nn.Dropout(0.3) between layers and watch generalization improve slightly; add nn.BatchNorm1d(256) before each ReLU and watch training loss drop faster.
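
One way to wire in both (a sketch; the class name is just for illustration, and the layer sizes match the MLP above):

class MLPRegularized(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 10)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.drop(F.relu(self.bn1(self.fc1(x))))   # BatchNorm before ReLU, Dropout after
        x = self.drop(F.relu(self.bn2(self.fc2(x))))
        return self.fc3(x)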

Vanishing gradients live

Build a 30-layer MLP with sigmoid activations. Print gradient norms per layer. Watch them collapse.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DeepSigmoid(nn.Module):
    def __init__(self, depth=30, width=64):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.Linear(width, width) for _ in range(depth)]
        )
        self.head = nn.Linear(width, 10)

    def forward(self, x):
        for l in self.layers:
            x = torch.sigmoid(l(x))
        return self.head(x)

model = DeepSigmoid()
x = torch.randn(8, 64)
y = torch.randint(0, 10, (8,))
loss = F.cross_entropy(model(x), y)
loss.backward()

for i, layer in enumerate(model.layers):
    g = layer.weight.grad.norm().item()
    print(f"layer {i:2d} grad norm: {g:.3e}")

You’ll see something like:

layer  0 grad norm: 1.2e-15
layer  1 grad norm: 8.4e-15
...
layer 28 grad norm: 4.7e-04
layer 29 grad norm: 9.1e-03

Early layers’ gradients are ~10⁻¹⁵ — effectively zero. The model can’t learn early features.
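
A rough back-of-envelope explains the scale: σ'(x) = σ(x)(1 − σ(x)) never exceeds 1/4, so every extra layer of depth multiplies the backward signal by another activation factor of at most 0.25 (the weight matrices shift the exact numbers, but the exponential decay wins):

# 29 extra sigmoid derivatives sit between layer 29's gradient and layer 0's,
# each contributing a factor of at most 0.25
print(0.25 ** 29)                          # ~3.5e-18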

Fix: ReLU + residual connections + LayerNorm

class DeepResNet(nn.Module):
    def __init__(self, depth=30, width=64):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.Linear(width, width) for _ in range(depth)]
        )
        self.norms = nn.ModuleList(
            [nn.LayerNorm(width) for _ in range(depth)]
        )
        self.head = nn.Linear(width, 10)

    def forward(self, x):
        for l, n in zip(self.layers, self.norms):
            x = x + F.relu(l(n(x)))   # pre-norm residual block: LayerNorm → Linear → ReLU → skip
        return self.head(x)

model = DeepResNet()
loss = F.cross_entropy(model(x), y)
loss.backward()

for i, layer in enumerate(model.layers):
    g = layer.weight.grad.norm().item()
    print(f"layer {i:2d} grad norm: {g:.3e}")

Now gradient norms are roughly uniform across depth, typically 10⁻³ to 10⁻². The model can learn. This is why every modern transformer uses residual connections, LayerNorm, and ReLU-family activations (GELU, SiLU).

Param count for an MLP

Compute parameter count for an MLP with shapes [784, 512, 256, 10].

def linear_params(in_dim, out_dim, bias=True):
    return in_dim * out_dim + (out_dim if bias else 0)

shapes = [784, 512, 256, 10]
total = 0
for a, b in zip(shapes, shapes[1:]):
    p = linear_params(a, b)
    print(f"Linear({a}, {b}): {p:,}")
    total += p
print(f"Total: {total:,}")

Output:

Linear(784, 512): 401,920
Linear(512, 256): 131,328
Linear(256, 10): 2,570
Total: 535,818

Tiny by modern standards. A 70B-parameter model is ~130,000× larger.
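
The same count falls out of PyTorch directly (nn.Sequential is fine here because we only need the parameters, not the training exercise above):

import torch.nn as nn

mlp = nn.Sequential(nn.Linear(784, 512), nn.ReLU(),
                    nn.Linear(512, 256), nn.ReLU(),
                    nn.Linear(256, 10))
print(sum(p.numel() for p in mlp.parameters()))   # 535818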

MLP without non-linearity

Replace F.relu with identity. Train. Observe.

class LinearMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.fc2(x)
        return self.fc3(x)

Train on MNIST with the same loop. You’ll get ~92% test accuracy — not bad! But you’ve effectively trained a single linear classifier y = (W₃ W₂ W₁) x + b_eff, where the three biases fold into one effective bias. The “depth” was wasted.

To verify: a single-layer linear model y = W x + b (i.e. logistic regression on raw pixels) hits about the same accuracy. A stack of linear ops collapses to one affine map.
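
You can also check the collapse numerically on an untrained LinearMLP (the random batch here is just a stand-in for real images):

model = LinearMLP()
x = torch.randn(5, 1, 28, 28)              # dummy MNIST-shaped batch

# fold W3(W2(W1 x + b1) + b2) + b3 into a single affine map W_eff x + b_eff
W1, b1 = model.fc1.weight, model.fc1.bias
W2, b2 = model.fc2.weight, model.fc2.bias
W3, b3 = model.fc3.weight, model.fc3.bias
W_eff = W3 @ W2 @ W1                       # (10, 784)
b_eff = W3 @ W2 @ b1 + W3 @ b2 + b3        # (10,)

out_collapsed = x.view(5, -1) @ W_eff.T + b_eff
print(torch.allclose(model(x), out_collapsed, atol=1e-4))   # True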

CNN on MNIST

class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x)); x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x)); x = F.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Train as before; should hit >99% test accuracy.
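
Where the 64 * 7 * 7 in fc1 comes from: both convolutions keep the spatial size (kernel 3, padding 1), and each 2×2 max-pool halves it, so 28 → 14 → 7 with 64 channels after conv2. A quick shape trace on a dummy batch:

model = CNN()
x = torch.randn(2, 1, 28, 28)                        # dummy MNIST-shaped batch
h = F.relu(model.conv1(x)); h = F.max_pool2d(h, 2)   # (2, 32, 14, 14)
h = F.relu(model.conv2(h)); h = F.max_pool2d(h, 2)   # (2, 64, 7, 7)
print(h.shape)                                       # torch.Size([2, 64, 7, 7])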

The CNN beats the MLP by roughly 1–2 percentage points on MNIST. The inductive bias (translation equivariance, locality) matches the data. On a non-image task, an MLP would be the right starting point.

See also