Stage 03 — Neural Networks: Solutions
Worked solutions for Stage 3.
Dependencies: torch, torchvision, numpy, matplotlib.
Manual backprop through 2-layer MLP
For a 2-layer MLP y = W₂ relu(W₁ x), derive ∂L/∂W₁ for L = (y − target)².
Let:
z₁ = W₁ x        (shape: hidden)
h₁ = relu(z₁)
y  = W₂ h₁       (scalar for this exercise)
L  = (y − t)²
Chain rule:
∂L/∂y = 2(y − t)
∂y/∂h₁ = W₂ # row vector
∂h₁/∂z₁ = diag(1[z₁ > 0]) # element-wise
∂z₁/∂W₁ = x (with appropriate shapes)
∂L/∂W₁ = (2(y − t) · W₂ᵀ ⊙ 1[z₁ > 0]) ⊗ x # outer product of δ_z1 with x, shape (hidden, in)
In code, where δ_z1 is the gradient w.r.t. z₁:
import torch
torch.manual_seed(0)
x = torch.randn(3) # (3,)
W1 = torch.randn(4, 3, requires_grad=True) # (hidden=4, in=3)
W2 = torch.randn(4, requires_grad=True) # (4,) for scalar output
target = torch.tensor(2.0)
# forward
z1 = W1 @ x
h1 = torch.relu(z1)
y = W2 @ h1
loss = (y - target) ** 2
loss.backward()
# manual
dL_dy = 2 * (y - target) # scalar
dL_dh1 = W2 * dL_dy # (4,)
dL_dz1 = dL_dh1 * (z1 > 0).float() # (4,)
dL_dW1_manual = torch.outer(dL_dz1, x) # (4, 3)
print("torch:", W1.grad)
print("manual:", dL_dW1_manual)
print("close?", torch.allclose(W1.grad, dL_dW1_manual.detach()))
ReLU’s “derivative” at 0 is conventionally 0 in PyTorch; (z1 > 0) matches.
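For completeness, the gradient w.r.t. W₂ follows the same chain rule (∂L/∂W₂ = 2(y − t) · h₁) and can be checked against autograd by reusing the tensors above:

# ∂L/∂W₂ = ∂L/∂y · ∂y/∂W₂ = 2(y − t) · h₁
dL_dW2_manual = dL_dy * h1
print("W2 close?", torch.allclose(W2.grad, dL_dW2_manual.detach()))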
MNIST MLP from raw PyTorch
Train an MLP on MNIST in raw PyTorch (no nn.Sequential). Hit >97% test accuracy.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
device = "cuda" if torch.cuda.is_available() else "cpu"
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
tfm = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_ds = datasets.MNIST(".", train=True, download=True, transform=tfm)
test_ds = datasets.MNIST(".", train=False, download=True, transform=tfm)
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=512)
model = MLP().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
for epoch in range(5):
    model.train()
    for x, y in train_dl:
        x, y = x.to(device), y.to(device)
        loss = F.cross_entropy(model(x), y)
        opt.zero_grad(); loss.backward(); opt.step()

    # eval
    model.eval()
    correct = 0; total = 0
    with torch.no_grad():
        for x, y in test_dl:
            x, y = x.to(device), y.to(device)
            pred = model(x).argmax(dim=1)
            correct += (pred == y).sum().item()
            total += y.size(0)
    print(f"epoch {epoch+1} test acc {correct/total:.4f}")
You should see 97–98% test accuracy by epoch 5. Add nn.Dropout(0.3) between the hidden layers and watch generalization improve slightly; add nn.BatchNorm1d(256) before each ReLU and watch training loss drop faster.
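A minimal sketch of that variant (BatchNorm before each ReLU, Dropout between the hidden layers); the class name is illustrative:

class MLPRegularized(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 10)
        self.drop = nn.Dropout(0.3)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.drop(F.relu(self.bn1(self.fc1(x))))  # BatchNorm before ReLU, Dropout after
        x = self.drop(F.relu(self.bn2(self.fc2(x))))
        return self.fc3(x)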
Vanishing gradients live
Build a 30-layer MLP with sigmoid activations. Print gradient norms per layer. Watch them collapse.
import torch
import torch.nn as nn
import torch.nn.functional as F
class DeepSigmoid(nn.Module):
    def __init__(self, depth=30, width=64):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.Linear(width, width) for _ in range(depth)]
        )
        self.head = nn.Linear(width, 10)

    def forward(self, x):
        for l in self.layers:
            x = torch.sigmoid(l(x))
        return self.head(x)
model = DeepSigmoid()
x = torch.randn(8, 64)
y = torch.randint(0, 10, (8,))
loss = F.cross_entropy(model(x), y)
loss.backward()
for i, layer in enumerate(model.layers):
    g = layer.weight.grad.norm().item()
    print(f"layer {i:2d} grad norm: {g:.3e}")
You’ll see something like:
layer 0 grad norm: 1.2e-15
layer 1 grad norm: 8.4e-15
...
layer 28 grad norm: 4.7e-04
layer 29 grad norm: 9.1e-03
Early layers’ gradients are ~10⁻¹⁵ — effectively zero. The model can’t learn early features.
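The collapse follows directly from the sigmoid derivative bound: σ′(z) = σ(z)(1 − σ(z)) ≤ 0.25, so each layer scales the backward signal by at most 0.25 (times a weight factor), and 30 such factors are already vanishingly small. A back-of-the-envelope check:

# Attenuation from 30 sigmoid derivatives alone, ignoring the weights
print(0.25 ** 30)  # ≈ 8.7e-19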
Fix: ReLU + residual connections
class DeepResNet(nn.Module):
    def __init__(self, depth=30, width=64):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.Linear(width, width) for _ in range(depth)]
        )
        self.norms = nn.ModuleList(
            [nn.LayerNorm(width) for _ in range(depth)]
        )
        self.head = nn.Linear(width, 10)

    def forward(self, x):
        for l, n in zip(self.layers, self.norms):
            x = x + F.relu(l(n(x)))  # pre-norm residual: LayerNorm -> Linear -> ReLU -> add
        return self.head(x)
model = DeepResNet()
loss = F.cross_entropy(model(x), y)
loss.backward()
for i, layer in enumerate(model.layers):
    g = layer.weight.grad.norm().item()
    print(f"layer {i:2d} grad norm: {g:.3e}")
Now gradient norms are roughly uniform across depth, typically 10⁻³ to 10⁻². The residual path is why: ∂(x + f(x))/∂x = I + ∂f/∂x, so every layer’s gradient includes an identity term that the activation cannot squash. The model can learn. This is why every modern transformer uses residuals + LayerNorm + ReLU-family activations.
Param count for an MLP
Compute the parameter count for an MLP with layer sizes [784, 512, 256, 10].
def linear_params(in_dim, out_dim, bias=True):
    return in_dim * out_dim + (out_dim if bias else 0)

shapes = [784, 512, 256, 10]
total = 0
for a, b in zip(shapes, shapes[1:]):
    p = linear_params(a, b)
    print(f"Linear({a}, {b}): {p:,}")
    total += p
print(f"Total: {total:,}")
Output:
Linear(784, 512): 401,920
Linear(512, 256): 131,328
Linear(256, 10): 2,570
Total: 535,818
Tiny by modern standards. A 70B-parameter model is ~130,000× larger.
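As a sanity check, the same total can be read off an equivalent PyTorch module (a quick sketch; nn.Sequential is used here only for counting, not training):

import torch.nn as nn

mlp = nn.Sequential(
    nn.Linear(784, 512), nn.ReLU(),
    nn.Linear(512, 256), nn.ReLU(),
    nn.Linear(256, 10),
)
print(sum(p.numel() for p in mlp.parameters()))  # 535818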
MLP without non-linearity
Replace F.relu with the identity. Train. Observe.
class LinearMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.fc2(x)
        return self.fc3(x)
Train on MNIST with the same loop. You’ll get ~92% test accuracy — not bad! But you’ve effectively trained a single linear classifier y = (W₃ W₂ W₁) x + b. The “depth” was wasted.
To verify: a single-layer linear model y = W x + b (i.e. logistic regression on raw pixels) hits about the same accuracy. A stack of linear ops collapses to a single linear op, as the check below confirms numerically.
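A quick sketch of that check: fold the three affine layers into one affine map and compare outputs (the folding holds whether or not the model has been trained).

# Collapse the three affine layers into a single map y = W_eff x + b_eff
model = LinearMLP()
with torch.no_grad():
    W1, b1 = model.fc1.weight, model.fc1.bias
    W2, b2 = model.fc2.weight, model.fc2.bias
    W3, b3 = model.fc3.weight, model.fc3.bias
    W_eff = W3 @ W2 @ W1                    # (10, 784)
    b_eff = W3 @ (W2 @ b1 + b2) + b3        # (10,)
    x = torch.randn(5, 784)
    print(torch.allclose(model(x), x @ W_eff.T + b_eff, atol=1e-4))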
CNN on MNIST
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x)); x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x)); x = F.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)
# Train as before; should hit >99% test accuracy.
The CNN beats the MLP by about 2 percentage points on MNIST. The inductive bias (translation equivariance, locality) matches the data. On a non-image task, an MLP would be the right starting point.