作为计算机视觉领域的基石,卷积神经网络(CNN)在过去十年中彻底改变了图像处理的方式。今天我将带大家手动实现三大经典CNN模型:AlexNet、ResNet和VGG。不同于直接调用现成模型,我们将从底层开始构建,深入理解每个卷积层、池化层的设计原理和实现细节。
在深度学习框架高度集成的今天,我们很容易陷入"调包侠"的陷阱——只会调用现成API而不理解底层原理。手动实现经典CNN模型的价值在于:加深对网络结构、参数设计与维度变化的理解,并为后续的模型调试与改进打下基础。
下面我们以PyTorch为例,从最简单的AlexNet开始,逐步实现更复杂的ResNet和VGG。
AlexNet是2012年ImageNet竞赛冠军,开启了深度学习在计算机视觉领域的新纪元。其核心结构包含:5个卷积层、3个最大池化层和3个全连接层。
输入尺寸为224×224的RGB图像,输出1000类的分类结果。
import torch
import torch.nn as nn
import torchvision.models as models

# Print torchvision's reference AlexNet so its layers can be compared
# against the hand-written version below.
alexnet = models.alexnet()
print(alexnet)
class MyAlexNet(nn.Module):
    """Hand-written AlexNet (224x224 RGB input, 1000-class logits output).

    Fix vs. the original listing: the unconditional debug ``print`` calls in
    ``forward`` are gated behind a backward-compatible ``verbose`` flag, so
    the model no longer writes to stdout on every forward pass.
    """

    def __init__(self, verbose=False):
        super(MyAlexNet, self).__init__()
        # When True, forward() prints intermediate feature-map sizes
        # (matches the dimension-tracking walkthrough in the article).
        self.verbose = verbose
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(0.5)
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(3, 64, 11, 4, padding=2)   # 224 -> 55
        self.pool1 = nn.MaxPool2d(3, stride=2)            # 55 -> 27
        self.conv2 = nn.Conv2d(64, 192, 5, 1, 2)
        self.pool2 = nn.MaxPool2d(3, stride=2)            # 27 -> 13
        self.conv3 = nn.Conv2d(192, 384, 3, 1, 1)
        self.conv4 = nn.Conv2d(384, 256, 3, 1, 1)
        self.conv5 = nn.Conv2d(256, 256, 3, 1, 1)
        self.pool3 = nn.MaxPool2d(3, stride=2)            # 13 -> 6
        self.adapool = nn.AdaptiveAvgPool2d(6)            # force 6x6 output
        # Classifier head: 256 * 6 * 6 = 9216 flattened features.
        self.fc1 = nn.Linear(9216, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 1000)

    def forward(self, x):
        """Run the feature extractor and classifier; returns (N, 1000) logits."""
        x = self.pool1(self.relu(self.conv1(x)))
        x = self.pool2(self.relu(self.conv2(x)))
        x = self.relu(self.conv3(x))
        if self.verbose:
            print("Conv3输出尺寸:", x.size())
        x = self.relu(self.conv4(x))
        if self.verbose:
            print("Conv4输出尺寸:", x.size())
        x = self.pool3(self.relu(self.conv5(x)))
        if self.verbose:
            print("Pool3输出尺寸:", x.size())
        x = self.adapool(x)
        x = x.view(x.size()[0], -1)  # flatten to (N, 9216)
        x = self.drop(self.relu(self.fc1(x)))
        x = self.drop(self.relu(self.fc2(x)))
        x = self.fc3(x)
        return x
实现完成后,可以从三个角度验证:卷积层参数设计是否与原论文一致、前向传播中各层的维度变化是否正确,以及整体参数量统计是否与官方实现吻合。
def get_parameter_number(model):
    """Return a dict with the total and trainable parameter counts of *model*."""
    params = list(model.parameters())
    total = sum(p.numel() for p in params)
    trainable = sum(p.numel() for p in params if p.requires_grad)
    return {'Total': total, 'Trainable': trainable}
# The count should match torchvision's AlexNet (~61M parameters).
model = MyAlexNet()
print(get_parameter_number(model))
输出结果应与官方AlexNet一致(约6100万参数),验证了实现的正确性。
对于教学演示或快速验证,可以去掉Dropout和非必要的ReLU,聚焦核心结构:
class SimpleAlexNet(nn.Module):
    """Stripped-down AlexNet for teaching: the same conv/pool/FC skeleton as
    MyAlexNet, but with Dropout and all ReLU activations deliberately omitted
    so the core structure (and its dimension changes) is easier to follow."""

    def __init__(self):
        super(SimpleAlexNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 11, 4, padding=2)
        self.pool1 = nn.MaxPool2d(3, 2)
        self.conv2 = nn.Conv2d(64, 192, 5, 1, padding=2)
        self.pool2 = nn.MaxPool2d(3, 2)
        self.conv3 = nn.Conv2d(192, 384, 3, 1, 1)
        self.conv4 = nn.Conv2d(384, 256, 3, 1, 1)
        self.conv5 = nn.Conv2d(256, 256, 3, 1, 1)
        self.pool3 = nn.MaxPool2d(3, 2)
        self.adapool = nn.AdaptiveAvgPool2d(6)
        self.fc1 = nn.Linear(9216, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, 1000)

    def forward(self, x):
        # Convolutional stack with three max-pool stages.
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.pool3(x)
        x = self.adapool(x)
        # Flatten, then the three fully connected layers back to back.
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
ResNet通过引入残差连接(residual connection)解决了深层网络的梯度消失问题,其主要特点包括:残差连接、批归一化(BatchNorm),以及在维度不匹配时用1×1卷积调整的捷径分支。
class ResidualBlock(nn.Module):
    """Basic two-conv residual block (ResNet-18/34 style).

    Computes relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x)); the
    shortcut is the identity unless the stride or channel count changes,
    in which case a 1x1 conv + BatchNorm projects the input to match.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            # 1x1 conv matches the main branch's shape so the two can be added.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        identity = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out = out + identity  # residual connection
        return self.relu(out)
class MyResNet18(nn.Module):
    """ResNet-18 built from ResidualBlock: a 7x7 stem, four two-block stages
    (64/128/256/512 channels), global average pooling, and a linear head."""

    def __init__(self, num_classes=1000):
        super(MyResNet18, self).__init__()
        self.in_channels = 64
        # Stem: 7x7/2 conv then 3x3/2 max-pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four residual stages; stages 2-4 halve the spatial size via stride 2.
        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        self.layer3 = self._make_layer(128, 256, 2, stride=2)
        self.layer4 = self._make_layer(256, 512, 2, stride=2)
        # Head: pool to 1x1, then classify.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        # The first block may downsample/widen; the rest keep the shape.
        strides = [stride] + [1] * (blocks - 1)
        channels = in_channels
        stage = []
        for s in strides:
            stage.append(ResidualBlock(channels, out_channels, s))
            channels = out_channels
        return nn.Sequential(*stage)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.fc(x)
通过打印各层输出尺寸,验证网络设计的正确性:
# Sanity check: a (1, 3, 224, 224) input must map to 1000 logits.
model = MyResNet18()
x = torch.randn(1, 3, 224, 224)
out = model(x)
print(out.shape)  # expect torch.Size([1, 1000])
VGG的核心设计理念是:全部使用3×3小卷积核堆叠来加深网络,并用2×2最大池化逐步将特征图尺寸减半。
class VGGBlock(nn.Module):
    """One VGG stage: *num_convs* 3x3 same-padding conv+ReLU pairs, followed
    by a 2x2 max-pool that halves the spatial resolution."""

    def __init__(self, in_channels, out_channels, num_convs):
        super(VGGBlock, self).__init__()
        modules = []
        channels = in_channels
        for _ in range(num_convs):
            modules += [
                nn.Conv2d(channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
            ]
            channels = out_channels  # subsequent convs keep the output width
        modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.block = nn.Sequential(*modules)

    def forward(self, x):
        return self.block(x)
class MyVGG13(nn.Module):
    """VGG-13: five VGGBlock stages (10 convs total) followed by the classic
    4096-4096-num_classes fully connected classifier."""

    def __init__(self, num_classes=1000):
        super(MyVGG13, self).__init__()
        # Feature extractor: channels double each stage, capped at 512.
        stage_cfg = [(3, 64), (64, 128), (128, 256), (256, 512), (512, 512)]
        self.features = nn.Sequential(
            *[VGGBlock(c_in, c_out, 2) for c_in, c_out in stage_cfg]
        )
        # Pool to a fixed 7x7 map so the classifier accepts any input size.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.classifier(x)
# Compare the hand-written VGG-13 against torchvision's reference model;
# the two parameter counts should agree (~133M).
vgg = MyVGG13()
print("自定义VGG13参数量:", get_parameter_number(vgg))
official_vgg = models.vgg13()
print("官方VGG13参数量:", get_parameter_number(official_vgg))
# Sigmoid demo: squashes each element independently into (0, 1).
# Fix: the originals bound their tensors to `input`/`output`, shadowing the
# builtin `input`; renamed to descriptive names.
sigmoid = nn.Sigmoid()
logits = torch.randn(4)
sig_out = sigmoid(logits)
print("Sigmoid输出:", sig_out)

# Softmax demo: normalizes each row (dim=1) into a probability distribution.
softmax = nn.Softmax(dim=1)
batch_logits = torch.randn(4, 5)
soft_out = softmax(batch_logits)
print("Softmax输出:", soft_out)
| 特性 | ReLU | Sigmoid | Softmax |
|---|---|---|---|
| 输出范围 | [0, +∞) | (0, 1) | (0, 1)且和为1 |
| 适用场景 | 隐藏层 | 二分类输出层 | 多分类输出层 |
| 梯度特性 | 正区间无衰减 | 最大梯度0.25 | 依赖输入分布 |
| 计算复杂度 | O(1) | O(1) | O(n) |
| 死亡神经元问题 | 可能存在 | 无 | 无 |
选择建议:隐藏层首选ReLU(计算简单且正区间梯度无衰减);输出层按任务选择——二分类用Sigmoid,多分类用Softmax;特殊情况下(如出现大量死亡神经元)可考虑ReLU的变体。
对每个自定义模型,都应进行前向传播测试:
def test_forward_pass(model, input_shape=(1, 3, 224, 224)):
    """Run one inference-mode forward pass on random data and report shapes.

    Returns the output shape so callers can assert on it.
    """
    model.eval()  # disable dropout / BN batch statistics
    with torch.no_grad():
        output = model(torch.randn(input_shape))
        print(f"输入形状: {input_shape}")
        print(f"输出形状: {output.shape}")
        return output.shape
# Smoke-test each hand-written model with a single forward pass
# (AlexNet, then ResNet-18, then VGG-13).
for net in (MyAlexNet(), MyResNet18(), MyVGG13()):
    test_forward_pass(net)
除了整体参数量,还应关注各层参数分布:
def print_layer_params(model):
    """Print the parameter count of every trainable tensor in *model*."""
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # skip frozen tensors
        print(f"{name}: {param.numel()}参数")

print_layer_params(MyResNet18())
常见问题与排查:维度不匹配错误(逐层打印输出尺寸定位问题层)、训练不收敛(检查学习率与归一化设置)、过拟合(加强数据增强与Dropout等正则化)。
# SGD with momentum; StepLR decays the learning rate 10x every 30 epochs.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
# Excerpt from a module's __init__: the standard conv -> BatchNorm -> ReLU trio.
self.conv = nn.Conv2d(in_c, out_c, 3)
self.bn = nn.BatchNorm2d(out_c)
self.relu = nn.ReLU()
from torchvision import transforms

# Training-time augmentation: random crop/flip/color jitter, then
# normalization with the standard ImageNet channel statistics.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# Dynamic quantization: Linear weights stored as int8, activations
# quantized on the fly at inference time.
quantized_model = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8
)

# Static quantization: observe activation ranges on calibration data,
# then convert the whole model.
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)
# ... run calibration batches here ...
torch.quantization.convert(model, inplace=True)
# Export to ONNX with a dynamic batch dimension so one file serves any
# batch size at inference time.
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy_input, "model.onnx",
                  input_names=["input"], output_names=["output"],
                  dynamic_axes={"input": {0: "batch_size"},
                                "output": {0: "batch_size"}})
# Automatic mixed precision: run the forward pass in fp16 where safe and
# scale the loss to avoid fp16 gradient underflow during backward.
scaler = torch.cuda.amp.GradScaler()
with torch.cuda.amp.autocast():
    outputs = model(inputs)
    loss = criterion(outputs, labels)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# Gradient accumulation: emulate a batch `accumulation_steps` times larger
# by stepping the optimizer only every few mini-batches.
for i, (inputs, labels) in enumerate(train_loader):
    outputs = model(inputs)
    loss = criterion(outputs, labels) / accumulation_steps  # average over steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Fix: the original snippet used `prune` without importing it (NameError).
from torch.nn.utils import prune

parameters_to_prune = (
    (model.conv1, 'weight'),
    (model.fc3, 'weight'),
)
# Zero out the 20% smallest-magnitude weights across both tensors
# (L1-norm criterion, unstructured sparsity).
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2,
)
# Load an ImageNet-pretrained backbone.
# NOTE(review): `pretrained=True` is deprecated in newer torchvision —
# prefer `models.resnet18(weights=models.ResNet18_Weights.DEFAULT)`.
model = models.resnet18(pretrained=True)

# Swap the 1000-way head for our 10-class task.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 10)

# Freeze the backbone and fine-tune only the new head.
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True
# Dataset/loader built from an ImageFolder directory tree.
dataset = datasets.ImageFolder(root='data/train', transform=train_transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Standard supervised training loop.
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
def ensemble_predict(models, input):
    """Average the predictions of several models on the same input tensor."""
    with torch.no_grad():
        stacked = torch.stack([m(input) for m in models])
        return stacked.mean(dim=0)
# Render the computation graph to model.png (requires the third-party
# torchviz package and Graphviz).
from torchviz import make_dot

x = torch.randn(1, 3, 224, 224)
y = model(x)
make_dot(y, params=dict(model.named_parameters())).render("model", format="png")
通过手动实现AlexNet、ResNet和VGG这三个经典CNN模型,我们深入理解了卷积神经网络的设计原理和实现细节。在实际项目中,我有以下几点建议:优先复用官方实现与预训练权重;用参数量对比和前向维度检查来验证自定义实现;并结合数据增强与正则化手段控制过拟合。
手动实现经典模型是深入理解深度学习的最佳途径之一。虽然现代框架提供了现成的实现,但只有亲自动手构建,才能真正掌握模型的设计精髓,为后续的模型改进和创新打下坚实基础。