模型定义
我们使用两个卷积层(CNN)加一个全连接层来定义模型:
import torch.nn as nn


class Model(nn.Module):
    """Small CNN classifier for 28 x 28 images (e.g. MNIST).

    Two convolution blocks (conv -> ReLU -> 2x2 max-pool) are followed by a
    single fully connected layer that produces one score per class.

    Args:
        in_channels: Number of channels in the input image (default 1).
        num_classes: Number of output classes (default 10).
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()

        # Block 1: (in_channels, 28, 28) -> (16, 14, 14).
        # kernel_size=5 with padding=2 keeps H and W; the pool halves them.
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

        # Block 2: (16, 14, 14) -> (32, 7, 7).
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

        # Classifier head over the flattened 32 * 7 * 7 feature map.
        # The 7 * 7 factor is tied to a 28 x 28 input resolution.
        self.fc = nn.Linear(32 * 7 * 7, num_classes)

    def forward(self, x):
        """Compute class scores for a batch; invoked through ``model(x)``.

        Args:
            x: Batch of images shaped (N, in_channels, 28, 28).

        Returns:
            Tensor of shape (N, num_classes) with unnormalized scores.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.reshape(x.size(0), -1)
        x = self.fc(x)
        return x
训练和测试函数
import torch


def _summarize(total_loss, correct, total, num_batches):
    """Return (mean per-batch loss, accuracy in percent); zeros if no data was seen."""
    if num_batches == 0 or total == 0:
        return 0.0, 0.0
    return total_loss / num_batches, 100. * correct / total


def train(model, train_loader, criterion, optimizer, device, epoch=0, writer=None):
    """Run one training pass over ``train_loader``.

    Args:
        model: Network to optimize; switched to training mode.
        train_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Loss callable taking ``(output, target)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to.
        epoch: Zero-based epoch index, used only to compute the global step
            for logging.
        writer: Optional object with ``add_scalar(tag, value, step)`` (e.g. a
            TensorBoard ``SummaryWriter``). Nothing is logged when ``None``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: mean loss per batch and accuracy in
        percent. ``(0.0, 0.0)`` if the loader is empty.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = len(train_loader)

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        # Forward pass: one row of class scores per sample.
        output = model(data)
        loss = criterion(output, target)
        # Predicted class = index of the largest score along dim 1.
        _, predicted = output.max(1)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics.
        batch_loss = loss.item()
        total_loss += batch_loss
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        # Progress report every 100 batches.
        if (batch_idx + 1) % 100 == 0:
            print(f'Batch: {batch_idx + 1}/{num_batches}, '
                  f'Loss: {batch_loss:.4f}, '
                  f'Accuracy: {100. * correct / total:.2f}%')

        # Per-step loss curve; the global step keeps epochs contiguous.
        if writer is not None:
            writer.add_scalar('Training Loss/Step',
                              batch_loss,
                              epoch * num_batches + batch_idx)

    return _summarize(total_loss, correct, total, num_batches)


def test(model, test_loader, criterion, device, epoch=0, writer=None):
    """Evaluate ``model`` on ``test_loader`` without computing gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        test_loader: Sized iterable yielding ``(data, target)`` batches.
        criterion: Loss callable taking ``(output, target)``.
        device: Device the batches are moved to.
        epoch: Epoch index used as the logging step.
        writer: Optional object with ``add_scalar(tag, value, step)``.
            Nothing is logged when ``None``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: mean loss per batch and accuracy in
        percent. ``(0.0, 0.0)`` if the loader is empty.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            output = model(data)
            # Predicted class = index of the largest score along dim 1.
            _, predicted = output.max(1)
            loss = criterion(output, target)

            total_loss += loss.item()
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

    avg_loss, accuracy = _summarize(total_loss, correct, total, len(test_loader))

    if writer is not None:
        writer.add_scalar('Testing Loss/Epoch', avg_loss, epoch)
        writer.add_scalar('Testing Accuracy/Epoch', accuracy, epoch)

    return avg_loss, accuracy


# The name starts with "test", so pytest would try to collect this helper
# whenever it is imported into a test module; opt it out explicitly.
test.__test__ = False
主程序
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from model import Model
from train import train, test# 定义超参数
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001


def load_data():
    """Build the MNIST training and test data loaders.

    The dataset is downloaded into ``./data`` when it is not already there.

    Returns:
        Tuple ``(train_loader, test_loader)``; only the training loader
        shuffles its samples.
    """
    # Convert to tensors, then normalize with the mean/std values used
    # throughout this tutorial.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])

    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transform,
                                               download=True)
    test_dataset = torchvision.datasets.MNIST(root='./data',
                                              train=False,
                                              transform=transform,
                                              download=True)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)
    return train_loader, test_loader


def main():
    """Train the model for ``EPOCHS`` epochs and save the best checkpoint.

    After every epoch the model is evaluated on the test set; whenever the
    test accuracy improves, the weights are written to ``mnist_model.pth``.
    Metrics are logged to a timestamped TensorBoard run under ``runs/``.
    """
    # Imported here so the logging dependency is only needed when training
    # is actually run.
    import datetime
    from torch.utils.tensorboard import SummaryWriter

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    train_loader, test_loader = load_data()

    model = Model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # One log directory per run so successive runs do not overwrite each other.
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter(f'runs/mnist_{timestamp}')

    best_accuracy = 0
    try:
        for epoch in range(EPOCHS):
            print(f'\nEpoch: {epoch + 1}/{EPOCHS}')

            # Training phase.
            train_loss, train_acc = train(model, train_loader, criterion,
                                          optimizer, device, epoch, writer)
            print(f'Training - Average Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%')

            # Evaluation phase.
            test_loss, test_acc = test(model, test_loader, criterion,
                                       device, epoch, writer)
            print(f'Testing - Average Loss: {test_loss:.4f}, Accuracy: {test_acc:.2f}%')

            # Keep only the best-performing weights.
            if test_acc > best_accuracy:
                best_accuracy = test_acc
                torch.save(model.state_dict(), 'mnist_model.pth')
    finally:
        # Flush pending events even if training is interrupted.
        writer.close()

    print(f'\nBest Test Accuracy: {best_accuracy:.2f}%')


if __name__ == '__main__':
    main()
TensorBoard 可视化
安装依赖:
pip install tensorboard torch_tb_profiler
修改程序,写入训练日志:
# Imports required by the logging fragments below.
import datetime

from torch.utils.tensorboard import SummaryWriter

# In main(), before the epoch loop: create one log directory per run.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/mnist_{timestamp}')

# In main(), once the model exists: record the network graph by tracing
# it with one batch of training images.
sample_images, _ = next(iter(train_loader))
writer.add_graph(model, sample_images.to(device))

# In test(), after avg_loss and accuracy have been computed: per-epoch metrics.
writer.add_scalar('Testing Loss/Epoch', avg_loss, epoch)
writer.add_scalar('Testing Accuracy/Epoch', accuracy, epoch)

# On the first epoch only: log a grid of the first 25 test images.
if epoch == 0:
    images, _ = next(iter(test_loader))
    img_grid = torchvision.utils.make_grid(images[:25])
    writer.add_image('mnist_images', img_grid)
在 VS Code 中,可以使用下面的命令启动 TensorBoard:
> Python: Launch TensorBoard