# 1
A CNN takes an RGB image (3 channels) as input. It is passed through the following sequence of layers:
- Conv layer with filters, each of size , stride , padding same;
- Max Pooling layer with window, stride ;
- Conv layer with filters, each of size , stride , padding valid;
- Max Pooling layer with window, stride ;
- A Flatten layer;
- A final Dense (Fully Connected) layer with units.
- Calculate the dimensions (height, width, number of channels) of the output after each of the first four layers. Show your work.
- What is the total number of elements after the Flatten layer?
- Calculate the number of trainable parameters in the first convolutional layer. (Remember: parameters include weights and biases.)
(1) Output dimensions after each of the first four layers:
- Conv layer I: with same padding the spatial size of the image is preserved, so only the number of filters matters: the output channel count equals the number of filters.
- Pooling layer I: the pooling windows are pairwise disjoint (stride equals window size), so height and width are halved and the channel count is unchanged.
- Conv layer II: with no padding and the given stride, the reduction in height and width is determined by the filter size, and the output channel count by the number of filters.
- Pooling layer II: the pooling windows are pairwise disjoint, so height and width are halved and the channel count is unchanged.
(2) The Flatten layer simply stretches the previous layer's output into a one-dimensional vector, so the total number of elements is the product of the height, width, and channel count of the last pooling output.
(3) In conv layer I, by the convolution formula
$$y^{(k)} = W^{(k)} * x + b^{(k)},$$
where $x$ is the input image, $W^{(k)}$ is the $k$-th convolution kernel, and $b^{(k)}$ is its bias term. The number of output channels equals the number of filters; the $k$-th filter contains one weight per spatial position and per input channel, plus a single bias term. Hence the total number of trainable parameters in conv layer I is
$$F \times (K_h \times K_w \times C_{\text{in}} + 1),$$
where $F$ is the number of filters, $K_h \times K_w$ the kernel size, and $C_{\text{in}}$ the number of input channels.
# 2
Build your own residual net and train it on the CIFAR-10 dataset; try your best (within your computing capability) to get the test accuracy to percent. Submit your code and training log.
The basic idea of ResNet is to introduce residual connections that alleviate the vanishing gradient problem. Below is a ResNet implementation written with AI assistance:
```python
import copy
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR, CosineAnnealingLR

# Check whether a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set random seeds for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# ==================== Data augmentation and preprocessing ====================
# Training-set augmentation
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Test-set preprocessing
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Load the CIFAR-10 dataset
batch_size = 128
train_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
test_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

# ==================== ResNet model definition ====================
class BasicBlock(nn.Module):
    """Basic ResNet residual block."""
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super(BasicBlock, self).__init__()
        # First convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3,
            stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        # Second convolutional layer
        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3,
            stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Shortcut connection
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != self.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_channels, self.expansion * out_channels,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(self.expansion * out_channels)
            )

    def forward(self, x):
        identity = x
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(identity)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    """Custom ResNet model."""

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 64
        # Initial convolutional layer
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        # Residual stages
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        # Global average pooling and fully connected layer
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # Dropout (optional, to reduce overfitting)
        self.dropout = nn.Dropout(0.3)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out


def resnet18():
    """Create a ResNet-18 model."""
    return ResNet(BasicBlock, [2, 2, 2, 2])


def resnet34():
    """Create a ResNet-34 model."""
    return ResNet(BasicBlock, [3, 4, 6, 3])


# ==================== Training and test functions ====================
def train(model, device, train_loader, optimizer, criterion, epoch):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        # Gradient clipping to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()
        if batch_idx % 100 == 0:
            print(f'Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                  f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
    train_accuracy = 100. * correct / total
    avg_loss = train_loss / len(train_loader)
    return avg_loss, train_accuracy


def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
    test_accuracy = 100. * correct / total
    avg_loss = test_loss / len(test_loader)
    return avg_loss, test_accuracy


# ==================== Main training loop ====================
def main():
    # Create the model
    print("Creating ResNet model...")
    model = resnet18().to(device)

    # Print the number of model parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total model parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # label smoothing against overfitting
    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,            # initial learning rate
        momentum=0.9,
        weight_decay=5e-4  # L2 regularization
    )
    # Learning-rate scheduler
    scheduler = MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)
    # Alternatively, cosine annealing:
    # scheduler = CosineAnnealingLR(optimizer, T_max=200)

    num_epochs = 200
    best_accuracy = 0
    best_model_state = None

    # Records of the training history
    train_losses, train_accs = [], []
    test_losses, test_accs = [], []

    print(f"\nStarting training for {num_epochs} epochs...")
    print("=" * 60)

    for epoch in range(1, num_epochs + 1):
        start_time = time.time()
        # Train
        train_loss, train_acc = train(model, device, train_loader, optimizer, criterion, epoch)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        # Test
        test_loss, test_acc = test(model, device, test_loader, criterion)
        test_losses.append(test_loss)
        test_accs.append(test_acc)
        # Update the learning rate
        scheduler.step()
        # Save the best model (deepcopy, so later training steps don't mutate the snapshot)
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            best_model_state = copy.deepcopy(model.state_dict())
            torch.save({
                'epoch': epoch,
                'model_state_dict': best_model_state,
                'optimizer_state_dict': optimizer.state_dict(),
                'accuracy': best_accuracy,
            }, 'best_model.pth')
        epoch_time = time.time() - start_time
        print(f'Epoch {epoch:3d}/{num_epochs} | '
              f'Time: {epoch_time:.2f}s | '
              f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | '
              f'Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.2f}% | '
              f'LR: {optimizer.param_groups[0]["lr"]:.6f}')
        # Save an intermediate checkpoint every 10 epochs
        if epoch % 10 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_losses': train_losses,
                'test_accs': test_accs,
            }, f'checkpoint_epoch_{epoch}.pth')

    # Load the best model
    model.load_state_dict(best_model_state)
    # Final evaluation
    final_loss, final_accuracy = test(model, device, test_loader, criterion)
    print(f"\n{'='*60}")
    print("Training complete!")
    print(f"Best test accuracy: {best_accuracy:.2f}%")
    print(f"Final test accuracy: {final_accuracy:.2f}%")

    # Plot the training curves
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(test_losses, label='Test Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Test Loss')
    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Train Accuracy')
    plt.plot(test_accs, label='Test Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.title('Training and Test Accuracy')
    plt.tight_layout()
    plt.savefig('training_curves.png')
    plt.show()

    return model, best_accuracy


if __name__ == '__main__':
    model, best_acc = main()
```
Part of the training log is shown below:
```text
Using device: cuda
Training set size: 50000
Test set size: 10000
Creating ResNet model...
Total model parameters: 11,173,962
Trainable parameters: 11,173,962

Starting training for 200 epochs...
============================================================
Epoch: 1 [0/50000 (0%)] Loss: 2.598158
Epoch: 1 [12800/50000 (26%)] Loss: 1.886283
Epoch: 1 [25600/50000 (51%)] Loss: 1.757755
Epoch: 1 [38400/50000 (77%)] Loss: 1.580445
Epoch 1/200 | Time: 53.87s | Train Loss: 1.8160 | Train Acc: 39.76% | Test Loss: 1.4789 | Test Acc: 56.14% | LR: 0.100000
Epoch: 2 [0/50000 (0%)] Loss: 1.444861
Epoch: 2 [12800/50000 (26%)] Loss: 1.479313
Epoch: 2 [25600/50000 (51%)] Loss: 1.316082
Epoch: 2 [38400/50000 (77%)] Loss: 1.387875
Epoch 2/200 | Time: 53.57s | Train Loss: 1.3979 | Train Acc: 59.62% | Test Loss: 1.3437 | Test Acc: 62.63% | LR: 0.100000
Epoch: 3 [0/50000 (0%)] Loss: 1.537805
Epoch: 3 [12800/50000 (26%)] Loss: 1.199809
Epoch: 3 [25600/50000 (51%)] Loss: 1.230053
Epoch: 3 [38400/50000 (77%)] Loss: 1.389225
Epoch 3/200 | Time: 53.94s | Train Loss: 1.2286 | Train Acc: 67.93% | Test Loss: 1.2040 | Test Acc: 69.61% | LR: 0.100000
Epoch: 4 [0/50000 (0%)] Loss: 1.127333
Epoch: 4 [12800/50000 (26%)] Loss: 1.309616
Epoch: 4 [25600/50000 (51%)] Loss: 1.030421
Epoch: 4 [38400/50000 (77%)] Loss: 1.079045
Epoch 4/200 | Time: 53.66s | Train Loss: 1.1251 | Train Acc: 72.60% | Test Loss: 1.2645 | Test Acc: 68.30% | LR: 0.100000
Epoch: 5 [0/50000 (0%)] Loss: 1.137778
Epoch: 5 [12800/50000 (26%)] Loss: 1.047550
Epoch: 5 [25600/50000 (51%)] Loss: 0.974705
Epoch: 5 [38400/50000 (77%)] Loss: 1.086789
Epoch 5/200 | Time: 53.51s | Train Loss: 1.0631 | Train Acc: 75.61% | Test Loss: 1.0473 | Test Acc: 75.64% | LR: 0.100000
Epoch: 6 [0/50000 (0%)] Loss: 1.064054
Epoch: 6 [12800/50000 (26%)] Loss: 0.963964
Epoch: 6 [25600/50000 (51%)] Loss: 1.007503
Epoch: 6 [38400/50000 (77%)] Loss: 0.969994
Epoch 6/200 | Time: 54.46s | Train Loss: 1.0097 | Train Acc: 77.95% | Test Loss: 1.1400 | Test Acc: 72.96% | LR: 0.100000
Epoch: 7 [0/50000 (0%)] Loss: 0.895288
Epoch: 7 [12800/50000 (26%)] Loss: 0.995895
Epoch: 7 [25600/50000 (51%)] Loss: 0.936781
Epoch: 7 [38400/50000 (77%)] Loss: 1.027178
Epoch 7/200 | Time: 53.85s | Train Loss: 0.9801 | Train Acc: 79.23% | Test Loss: 1.0211 | Test Acc: 76.88% | LR: 0.100000
Epoch: 8 [0/50000 (0%)] Loss: 0.983641
Epoch: 8 [12800/50000 (26%)] Loss: 0.804812
Epoch: 8 [25600/50000 (51%)] Loss: 0.898298
Epoch: 8 [38400/50000 (77%)] Loss: 0.971728
Epoch 8/200 | Time: 55.00s | Train Loss: 0.9553 | Train Acc: 80.29% | Test Loss: 0.9534 | Test Acc: 80.80% | LR: 0.100000
Epoch: 9 [0/50000 (0%)] Loss: 0.874495
Epoch: 9 [12800/50000 (26%)] Loss: 0.851649
Epoch: 9 [25600/50000 (51%)] Loss: 0.977242
Epoch: 9 [38400/50000 (77%)] Loss: 1.067239
Epoch 9/200 | Time: 79.66s | Train Loss: 0.9405 | Train Acc: 81.11% | Test Loss: 0.9553 | Test Acc: 80.36% | LR: 0.100000
Epoch: 10 [0/50000 (0%)] Loss: 0.989358
Epoch: 10 [12800/50000 (26%)] Loss: 0.845767
Epoch: 10 [25600/50000 (51%)] Loss: 0.907775
Epoch: 10 [38400/50000 (77%)] Loss: 1.033314
Epoch 10/200 | Time: 53.40s | Train Loss: 0.9259 | Train Acc: 81.70% | Test Loss: 0.9687 | Test Acc: 79.80% | LR: 0.100000
Epoch: 11 [0/50000 (0%)] Loss: 1.002646
Epoch: 11 [12800/50000 (26%)] Loss: 0.807637
Epoch: 11 [25600/50000 (51%)] Loss: 0.912228
Epoch: 11 [38400/50000 (77%)] Loss: 0.950749
Epoch 11/200 | Time: 53.85s | Train Loss: 0.9147 | Train Acc: 82.18% | Test Loss: 0.9289 | Test Acc: 81.39% | LR: 0.100000
Epoch: 12 [0/50000 (0%)] Loss: 0.924864
Epoch: 12 [12800/50000 (26%)] Loss: 0.942824
Epoch: 12 [25600/50000 (51%)] Loss: 1.009633
Epoch: 12 [38400/50000 (77%)] Loss: 0.862141
Epoch 12/200 | Time: 54.52s | Train Loss: 0.9080 | Train Acc: 82.51% | Test Loss: 1.1401 | Test Acc: 72.76% | LR: 0.100000
Epoch: 13 [0/50000 (0%)] Loss: 0.934913
Epoch: 13 [12800/50000 (26%)] Loss: 0.898304
Epoch: 13 [25600/50000 (51%)] Loss: 0.896272
Epoch: 13 [38400/50000 (77%)] Loss: 0.865075
Epoch 13/200 | Time: 54.63s | Train Loss: 0.9009 | Train Acc: 82.75% | Test Loss: 0.8914 | Test Acc: 83.25% | LR: 0.100000
Epoch: 14 [0/50000 (0%)] Loss: 0.891441
Epoch: 14 [12800/50000 (26%)] Loss: 0.907482
Epoch: 14 [25600/50000 (51%)] Loss: 0.857055
Epoch: 14 [38400/50000 (77%)] Loss: 0.914766
Epoch 14/200 | Time: 54.15s | Train Loss: 0.8900 | Train Acc: 83.20% | Test Loss: 0.9459 | Test Acc: 81.01% | LR: 0.100000
Epoch: 15 [0/50000 (0%)] Loss: 0.800900
Epoch: 15 [12800/50000 (26%)] Loss: 0.867119
Epoch: 15 [25600/50000 (51%)] Loss: 0.853828
Epoch: 15 [38400/50000 (77%)] Loss: 0.828357
Epoch 15/200 | Time: 54.15s | Train Loss: 0.8804 | Train Acc: 83.69% | Test Loss: 0.9595 | Test Acc: 80.49% | LR: 0.100000
Epoch: 16 [0/50000 (0%)] Loss: 0.906890
Epoch: 16 [12800/50000 (26%)] Loss: 0.887977
Epoch: 16 [25600/50000 (51%)] Loss: 0.770198
Epoch: 16 [38400/50000 (77%)] Loss: 0.843562
Epoch 16/200 | Time: 54.12s | Train Loss: 0.8753 | Train Acc: 83.86% | Test Loss: 0.9450 | Test Acc: 80.98% | LR: 0.100000
Epoch: 17 [0/50000 (0%)] Loss: 0.875333
Epoch: 17 [12800/50000 (26%)] Loss: 0.940536
Epoch: 17 [25600/50000 (51%)] Loss: 0.926693
Epoch: 17 [38400/50000 (77%)] Loss: 0.918440
Epoch 17/200 | Time: 54.22s | Train Loss: 0.8747 | Train Acc: 83.97% | Test Loss: 0.9690 | Test Acc: 79.95% | LR: 0.100000
Epoch: 18 [0/50000 (0%)] Loss: 0.857610
Epoch: 18 [12800/50000 (26%)] Loss: 0.872131
Epoch: 18 [25600/50000 (51%)] Loss: 0.981058
Epoch: 18 [38400/50000 (77%)] Loss: 0.863057
Epoch 18/200 | Time: 53.98s | Train Loss: 0.8674 | Train Acc: 84.26% | Test Loss: 1.0064 | Test Acc: 78.28% | LR: 0.100000
Epoch: 19 [0/50000 (0%)] Loss: 0.860547
Epoch: 19 [12800/50000 (26%)] Loss: 0.903203
Epoch: 19 [25600/50000 (51%)] Loss: 0.822991
Epoch: 19 [38400/50000 (77%)] Loss: 0.921520
Epoch 19/200 | Time: 54.02s | Train Loss: 0.8631 | Train Acc: 84.58% | Test Loss: 1.0266 | Test Acc: 78.11% | LR: 0.100000
Epoch: 20 [0/50000 (0%)] Loss: 0.864497
Epoch: 20 [12800/50000 (26%)] Loss: 0.826816
Epoch: 20 [25600/50000 (51%)] Loss: 0.836563
Epoch: 20 [38400/50000 (77%)] Loss: 0.812488
Epoch 20/200 | Time: 54.56s | Train Loss: 0.8608 | Train Acc: 84.60% | Test Loss: 0.9077 | Test Acc: 82.32% | LR: 0.100000
Epoch: 21 [0/50000 (0%)] Loss: 0.795630
Epoch: 21 [12800/50000 (26%)] Loss: 0.895831
Epoch: 21 [25600/50000 (51%)] Loss: 0.943226
Epoch: 21 [38400/50000 (77%)] Loss: 0.773665
Epoch 21/200 | Time: 54.25s | Train Loss: 0.8601 | Train Acc: 84.66% | Test Loss: 1.0858 | Test Acc: 75.16% | LR: 0.100000
Epoch: 22 [0/50000 (0%)] Loss: 0.938416
Epoch: 22 [12800/50000 (26%)] Loss: 0.888500
Epoch: 22 [25600/50000 (51%)] Loss: 0.756124
Epoch: 22 [38400/50000 (77%)] Loss: 0.843318
Epoch 22/200 | Time: 56.05s | Train Loss: 0.8569 | Train Acc: 84.87% | Test Loss: 0.8957 | Test Acc: 83.12% | LR: 0.100000
Epoch: 23 [0/50000 (0%)] Loss: 0.914953
Epoch: 23 [12800/50000 (26%)] Loss: 0.881937
Epoch: 23 [25600/50000 (51%)] Loss: 0.935297
Epoch: 23 [38400/50000 (77%)] Loss: 0.873078
Epoch 23/200 | Time: 54.64s | Train Loss: 0.8535 | Train Acc: 84.86% | Test Loss: 0.9544 | Test Acc: 80.63% | LR: 0.100000
Epoch: 24 [0/50000 (0%)] Loss: 0.803802
Epoch: 24 [12800/50000 (26%)] Loss: 0.791671
Epoch: 24 [25600/50000 (51%)] Loss: 0.809881
Epoch: 24 [38400/50000 (77%)] Loss: 0.838971
Epoch 24/200 | Time: 54.93s | Train Loss: 0.8514 | Train Acc: 84.83% | Test Loss: 0.9577 | Test Acc: 79.69% | LR: 0.100000
Epoch: 25 [0/50000 (0%)] Loss: 0.858007
Epoch: 25 [12800/50000 (26%)] Loss: 0.756386
Epoch: 25 [25600/50000 (51%)] Loss: 0.863666
Epoch: 25 [38400/50000 (77%)] Loss: 0.860152
Epoch 25/200 | Time: 54.81s | Train Loss: 0.8496 | Train Acc: 85.14% | Test Loss: 0.9546 | Test Acc: 80.74% | LR: 0.100000
Epoch: 26 [0/50000 (0%)] Loss: 0.831551
Epoch: 26 [12800/50000 (26%)] Loss: 0.916696
Epoch: 26 [25600/50000 (51%)] Loss: 1.014461
Epoch: 26 [38400/50000 (77%)] Loss: 0.929880
Epoch 26/200 | Time: 54.58s | Train Loss: 0.8450 | Train Acc: 85.26% | Test Loss: 0.8793 | Test Acc: 83.58% | LR: 0.100000
Epoch: 27 [0/50000 (0%)] Loss: 0.840893
Epoch: 27 [12800/50000 (26%)] Loss: 0.878673
Epoch: 27 [25600/50000 (51%)] Loss: 0.806746
Epoch: 27 [38400/50000 (77%)] Loss: 0.888433
Epoch 27/200 | Time: 56.18s | Train Loss: 0.8430 | Train Acc: 85.37% | Test Loss: 0.8076 | Test Acc: 86.60% | LR: 0.100000
Epoch: 28 [0/50000 (0%)] Loss: 0.789495
Epoch: 28 [12800/50000 (26%)] Loss: 0.762469
Epoch: 28 [25600/50000 (51%)] Loss: 0.848810
Epoch: 28 [38400/50000 (77%)] Loss: 0.874455
Epoch 28/200 | Time: 72.53s | Train Loss: 0.8402 | Train Acc: 85.43% | Test Loss: 0.9131 | Test Acc: 82.04% | LR: 0.100000
Epoch: 29 [0/50000 (0%)] Loss: 0.784175
Epoch: 29 [12800/50000 (26%)] Loss: 0.763392
Epoch: 29 [25600/50000 (51%)] Loss: 0.715707
Epoch: 29 [38400/50000 (77%)] Loss: 0.715120
Epoch 29/200 | Time: 54.82s | Train Loss: 0.8369 | Train Acc: 85.72% | Test Loss: 0.8513 | Test Acc: 84.56% | LR: 0.100000
Epoch: 30 [0/50000 (0%)] Loss: 0.800729
Epoch: 30 [12800/50000 (26%)] Loss: 0.822383
Epoch: 30 [25600/50000 (51%)] Loss: 0.916324
Epoch: 30 [38400/50000 (77%)] Loss: 0.796236
Epoch 30/200 | Time: 54.51s | Train Loss: 0.8414 | Train Acc: 85.39% | Test Loss: 0.8714 | Test Acc: 84.37% | LR: 0.100000
Epoch: 31 [0/50000 (0%)] Loss: 0.848700
Epoch: 31 [12800/50000 (26%)] Loss: 0.787045
Epoch: 31 [25600/50000 (51%)] Loss: 0.847590
Epoch: 31 [38400/50000 (77%)] Loss: 0.907628
Epoch 31/200 | Time: 54.08s | Train Loss: 0.8357 | Train Acc: 85.66% | Test Loss: 0.9253 | Test Acc: 81.90% | LR: 0.100000
Epoch: 32 [0/50000 (0%)] Loss: 0.898938
Epoch: 32 [12800/50000 (26%)] Loss: 0.834288
Epoch: 32 [25600/50000 (51%)] Loss: 0.800032
Epoch: 32 [38400/50000 (77%)] Loss: 0.911632
Epoch 32/200 | Time: 54.14s | Train Loss: 0.8388 | Train Acc: 85.57% | Test Loss: 1.0133 | Test Acc: 78.52% | LR: 0.100000
Epoch: 33 [0/50000 (0%)] Loss: 0.774994
Epoch: 33 [12800/50000 (26%)] Loss: 0.882923
Epoch: 33 [25600/50000 (51%)] Loss: 0.812098
Epoch: 33 [38400/50000 (77%)] Loss: 0.824945
Epoch 33/200 | Time: 54.45s | Train Loss: 0.8301 | Train Acc: 86.01% | Test Loss: 0.9056 | Test Acc: 82.61% | LR: 0.100000
Epoch: 34 [0/50000 (0%)] Loss: 0.845768
Epoch: 34 [12800/50000 (26%)] Loss: 0.827564
Epoch: 34 [25600/50000 (51%)] Loss: 0.769887
Epoch: 34 [38400/50000 (77%)] Loss: 0.800083
Epoch 34/200 | Time: 54.80s | Train Loss: 0.8357 | Train Acc: 85.66% | Test Loss: 0.8552 | Test Acc: 84.82% | LR: 0.100000
Epoch: 35 [0/50000 (0%)] Loss: 0.846894
Epoch: 35 [12800/50000 (26%)] Loss: 0.789980
Epoch: 35 [25600/50000 (51%)] Loss: 0.807628
Epoch: 35 [38400/50000 (77%)] Loss: 0.902017
Epoch 35/200 | Time: 54.73s | Train Loss: 0.8322 | Train Acc: 85.80% | Test Loss: 0.8848 | Test Acc: 83.20% | LR: 0.100000
Epoch: 36 [0/50000 (0%)] Loss: 0.831893
Epoch: 36 [12800/50000 (26%)] Loss: 0.780592
Epoch: 36 [25600/50000 (51%)] Loss: 0.749251
Epoch: 36 [38400/50000 (77%)] Loss: 0.876861
Epoch 36/200 | Time: 54.61s | Train Loss: 0.8310 | Train Acc: 86.02% | Test Loss: 0.9535 | Test Acc: 81.04% | LR: 0.100000
Epoch: 37 [0/50000 (0%)] Loss: 0.917584
Epoch: 37 [12800/50000 (26%)] Loss: 0.752700
Epoch: 37 [25600/50000 (51%)] Loss: 0.834917
Epoch: 37 [38400/50000 (77%)] Loss: 0.849060
Epoch 37/200 | Time: 54.78s | Train Loss: 0.8295 | Train Acc: 85.89% | Test Loss: 0.8575 | Test Acc: 84.79% | LR: 0.100000
Epoch: 38 [0/50000 (0%)] Loss: 0.808229
Epoch: 38 [12800/50000 (26%)] Loss: 0.764553
Epoch: 38 [25600/50000 (51%)] Loss: 0.869413
Epoch: 38 [38400/50000 (77%)] Loss: 0.975918
Epoch 38/200 | Time: 54.38s | Train Loss: 0.8325 | Train Acc: 85.92% | Test Loss: 1.0467 | Test Acc: 77.26% | LR: 0.100000
Epoch: 39 [0/50000 (0%)] Loss: 0.792865
Epoch: 39 [12800/50000 (26%)] Loss: 0.740830
Epoch: 39 [25600/50000 (51%)] Loss: 0.903176
Epoch: 39 [38400/50000 (77%)] Loss: 0.849468
Epoch 39/200 | Time: 54.15s | Train Loss: 0.8291 | Train Acc: 85.95% | Test Loss: 0.8530 | Test Acc: 84.60% | LR: 0.100000
Epoch: 40 [0/50000 (0%)] Loss: 0.794266
Epoch: 40 [12800/50000 (26%)] Loss: 0.965265
Epoch: 40 [25600/50000 (51%)] Loss: 0.970391
Epoch: 40 [38400/50000 (77%)] Loss: 0.820289
Epoch 40/200 | Time: 54.60s | Train Loss: 0.8263 | Train Acc: 86.14% | Test Loss: 0.9158 | Test Acc: 82.22% | LR: 0.100000
Epoch: 41 [0/50000 (0%)] Loss: 0.859779
Epoch: 41 [12800/50000 (26%)] Loss: 0.823970
Epoch: 41 [25600/50000 (51%)] Loss: 0.845126
Epoch: 41 [38400/50000 (77%)] Loss: 0.894197
Epoch 41/200 | Time: 54.31s | Train Loss: 0.8295 | Train Acc: 86.02% | Test Loss: 0.9632 | Test Acc: 80.73% | LR: 0.100000
Epoch: 42 [0/50000 (0%)] Loss: 0.751721
Epoch: 42 [12800/50000 (26%)] Loss: 0.927409
Epoch: 42 [25600/50000 (51%)] Loss: 0.779975
Epoch: 42 [38400/50000 (77%)] Loss: 0.827765
Epoch 42/200 | Time: 55.41s | Train Loss: 0.8295 | Train Acc: 86.01% | Test Loss: 0.9369 | Test Acc: 81.37% | LR: 0.100000
Epoch: 43 [0/50000 (0%)] Loss: 0.812640
Epoch: 43 [12800/50000 (26%)] Loss: 0.851625
Epoch: 43 [25600/50000 (51%)] Loss: 0.821735
Epoch: 43 [38400/50000 (77%)] Loss: 0.775307
Epoch 43/200 | Time: 54.73s | Train Loss: 0.8267 | Train Acc: 86.21% | Test Loss: 0.8423 | Test Acc: 85.28% | LR: 0.100000
Epoch: 44 [0/50000 (0%)] Loss: 0.806267
Epoch: 44 [12800/50000 (26%)] Loss: 0.920151
Epoch: 44 [25600/50000 (51%)] Loss: 1.011200
Epoch: 44 [38400/50000 (77%)] Loss: 0.837567
Epoch 44/200 | Time: 54.33s | Train Loss: 0.8230 | Train Acc: 86.32% | Test Loss: 0.8519 | Test Acc: 84.80% | LR: 0.100000
Epoch: 45 [0/50000 (0%)] Loss: 0.808053
Epoch: 45 [12800/50000 (26%)] Loss: 0.894567
Epoch: 45 [25600/50000 (51%)] Loss: 0.771580
Epoch: 45 [38400/50000 (77%)] Loss: 0.710069
Epoch 45/200 | Time: 54.43s | Train Loss: 0.8222 | Train Acc: 86.32% | Test Loss: 0.8485 | Test Acc: 85.10% | LR: 0.100000
Epoch: 46 [0/50000 (0%)] Loss: 0.789186
Epoch: 46 [12800/50000 (26%)] Loss: 0.834816
Epoch: 46 [25600/50000 (51%)] Loss: 0.878065
Epoch: 46 [38400/50000 (77%)] Loss: 0.829138
Epoch 46/200 | Time: 54.23s | Train Loss: 0.8225 | Train Acc: 86.18% | Test Loss: 0.8551 | Test Acc: 84.87% | LR: 0.100000
Epoch: 47 [0/50000 (0%)] Loss: 0.801989
Epoch: 47 [12800/50000 (26%)] Loss: 0.917647
Epoch: 47 [25600/50000 (51%)] Loss: 0.761201
Epoch: 47 [38400/50000 (77%)] Loss: 0.815080
Epoch 47/200 | Time: 55.31s | Train Loss: 0.8207 | Train Acc: 86.42% | Test Loss: 0.8701 | Test Acc: 83.91% | LR: 0.100000
Epoch: 48 [0/50000 (0%)] Loss: 0.852317
Epoch: 48 [12800/50000 (26%)] Loss: 0.847572
Epoch: 48 [25600/50000 (51%)] Loss: 0.910867
Epoch: 48 [38400/50000 (77%)] Loss: 0.904468
Epoch 48/200 | Time: 54.46s | Train Loss: 0.8178 | Train Acc: 86.49% | Test Loss: 0.8769 | Test Acc: 83.87% | LR: 0.100000
Epoch: 49 [0/50000 (0%)] Loss: 0.732170
Epoch: 49 [12800/50000 (26%)] Loss: 0.854018
Epoch: 49 [25600/50000 (51%)] Loss: 0.850230
Epoch: 49 [38400/50000 (77%)] Loss: 0.806912
Epoch 49/200 | Time: 54.66s | Train Loss: 0.8210 | Train Acc: 86.41% | Test Loss: 0.9036 | Test Acc: 82.96% | LR: 0.100000
Epoch: 50 [0/50000 (0%)] Loss: 0.855476
Epoch: 50 [12800/50000 (26%)] Loss: 0.802625
Epoch: 50 [25600/50000 (51%)] Loss: 0.878814
Epoch: 50 [38400/50000 (77%)] Loss: 0.725013
Epoch 50/200 | Time: 54.50s | Train Loss: 0.8196 | Train Acc: 86.33% | Test Loss: 0.8937 | Test Acc: 83.23% | LR: 0.100000
Epoch: 51 [0/50000 (0%)] Loss: 0.792211
Epoch: 51 [12800/50000 (26%)] Loss: 0.828319
Epoch: 51 [25600/50000 (51%)] Loss: 0.758899
Epoch: 51 [38400/50000 (77%)] Loss: 0.966560
Epoch 51/200 | Time: 54.87s | Train Loss: 0.8188 | Train Acc: 86.33% | Test Loss: 0.9148 | Test Acc: 82.11% | LR: 0.100000
Epoch: 52 [0/50000 (0%)] Loss: 0.814625
Epoch: 52 [12800/50000 (26%)] Loss: 0.810442
Epoch: 52 [25600/50000 (51%)] Loss: 0.809587
Epoch: 52 [38400/50000 (77%)] Loss: 0.787845
Epoch 52/200 | Time: 54.67s | Train Loss: 0.8217 | Train Acc: 86.33% | Test Loss: 0.8117 | Test Acc: 86.61% | LR: 0.100000
Epoch: 53 [0/50000 (0%)] Loss: 0.751319
Epoch: 53 [12800/50000 (26%)] Loss: 0.826305
Epoch: 53 [25600/50000 (51%)] Loss: 0.735571
Epoch: 53 [38400/50000 (77%)] Loss: 0.803900
Epoch 53/200 | Time: 55.02s | Train Loss: 0.8160 | Train Acc: 86.40% | Test Loss: 0.8388 | Test Acc: 85.01% | LR: 0.100000
Epoch: 54 [0/50000 (0%)] Loss: 0.884010
Epoch: 54 [12800/50000 (26%)] Loss: 0.844160
Epoch: 54 [25600/50000 (51%)] Loss: 0.874899
Epoch: 54 [38400/50000 (77%)] Loss: 0.785140
Epoch 54/200 | Time: 55.07s | Train Loss: 0.8208 | Train Acc: 86.37% | Test Loss: 0.8847 | Test Acc: 83.48% | LR: 0.100000
Epoch: 55 [0/50000 (0%)] Loss: 0.747851
Epoch: 55 [12800/50000 (26%)] Loss: 0.796169
Epoch: 55 [25600/50000 (51%)] Loss: 0.835408
Epoch: 55 [38400/50000 (77%)] Loss: 1.008027
Epoch 55/200 | Time: 54.39s | Train Loss: 0.8207 | Train Acc: 86.27% | Test Loss: 0.8297 | Test Acc: 85.93% | LR: 0.100000
```
# 3
Consider a simple RNN cell defined by the following equations:
$$h_t = \tanh(W_{hh} h_{t-1} + W_{xh} x_t), \qquad y_t = W_{hy} h_t,$$
where:
- $x_t$ is the input at time step $t$;
- $h_t$ is the hidden state at time step $t$ ($h_0$ is initialized to zeros);
- $y_t$ is the output at time step $t$.
Given an input sequence $(x_1, x_2, x_3)$:
- Draw the computational graph by unrolling the RNN for all three time steps. Clearly show the inputs $x_1, x_2, x_3$, hidden states $h_1, h_2, h_3$, outputs $y_1, y_2, y_3$, and the sharing of weights $W_{xh}$, $W_{hh}$, and $W_{hy}$ across time.
- Using your graph, explain how the hidden state $h_3$ depends on the entire input sequence $(x_1, x_2, x_3)$. Why is the hidden state considered the "memory" of the network?
(1) The unrolled computational graph is shown below. [Figure: the RNN unrolled over three time steps, with $x_t \to h_t \to y_t$ at each step, recurrent edges $h_{t-1} \to h_t$, and the weights $W_{xh}$, $W_{hh}$, $W_{hy}$ shared across all steps.]
(2) Expanding the computation of $h_3$:
$$h_3 = \tanh(W_{hh} h_2 + W_{xh} x_3) = \tanh\big(W_{hh}\tanh(W_{hh} h_1 + W_{xh} x_2) + W_{xh} x_3\big) = \tanh\big(W_{hh}\tanh(W_{hh}\tanh(W_{hh} h_0 + W_{xh} x_1) + W_{xh} x_2) + W_{xh} x_3\big).$$
Alternatively, from the graphical-model viewpoint, there is a path from each of $x_1, x_2, x_3$ to $h_3$, so $h_3$ depends on the entire input sequence $(x_1, x_2, x_3)$. The expression above makes this explicit for $h_3$, and the same holds for every $h_t$: each hidden state carries information from all earlier time steps, playing the role of remembering the past, which is why it is called the network's "memory". How that memory is shaped depends on the learned weight matrices $W_{xh}$ and $W_{hh}$: the tanh activation re-encodes the information, while the weight matrices determine what tends to be stored or forgotten.
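As a quick check of the unrolled computation, here is a minimal sketch (with assumed small dimensions and random weights, since the problem gives no concrete values) that computes $h_1, h_2, h_3$ step by step and verifies the fully expanded expression for $h_3$:

```python
import torch

torch.manual_seed(0)
d_x, d_h = 2, 3                             # assumed input / hidden sizes
W_xh = torch.randn(d_h, d_x)
W_hh = torch.randn(d_h, d_h)
W_hy = torch.randn(1, d_h)
xs = [torch.randn(d_x) for _ in range(3)]   # input sequence x1, x2, x3

h = torch.zeros(d_h)                        # h0 initialized to zeros
for t, x in enumerate(xs, start=1):
    h = torch.tanh(W_hh @ h + W_xh @ x)     # the same shared weights at every step
    y = W_hy @ h
    print(f"h{t} = {h}, y{t} = {y}")

# h3 written as one nested expression over the whole input sequence:
h3 = torch.tanh(W_hh @ torch.tanh(W_hh @ torch.tanh(W_xh @ xs[0])
                                  + W_xh @ xs[1]) + W_xh @ xs[2])
print(torch.allclose(h, h3))                # True: h3 depends on x1, x2, x3
```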
# 4
The RNN network was described in last problem.
- During Backpropagation Through Time (BPTT), the gradient of the loss with respect to an early hidden state involves a long chain of derivatives. Write the expression for $\frac{\partial L}{\partial h_1}$ in terms of $\frac{\partial L}{\partial h_3}$, showing the chain rule through $h_2$.
- The tanh derivative is $\tanh'(z) = 1 - \tanh^2(z)$, which is always $\leq 1$. Explain how repeated multiplication of these derivatives (and the weight matrix $W_{hh}$) during BPTT can cause the vanishing gradient problem.
- What is the negative consequence of vanishing gradients for an RNN's ability to learn long-range dependencies in sequences?
(1) By the chain rule on the graphical model, backpropagating through time gives
$$\frac{\partial L}{\partial h_1} = \frac{\partial L}{\partial h_3}\,\frac{\partial h_3}{\partial h_2}\,\frac{\partial h_2}{\partial h_1},$$
where each $h_t$ is in general a vector, so this is really a product of Jacobian matrices. The intermediate factors are computed as follows: since the activation $\tanh$ is applied element-wise,
$$\frac{\partial h_t}{\partial h_{t-1}} = \operatorname{diag}\!\big(1 - h_t^2\big)\, W_{hh}.$$
Substituting, the expression becomes
$$\frac{\partial L}{\partial h_1} = \frac{\partial L}{\partial h_3}\,\operatorname{diag}\!\big(1 - h_3^2\big) W_{hh}\,\operatorname{diag}\!\big(1 - h_2^2\big) W_{hh}.$$
(2) In BPTT, as shown above, the backpropagated gradient is a long product, and at every step the entries of $\operatorname{diag}(1 - h_t^2)$ are at most $1$. For long sequences this architecture therefore produces gradients that decay exponentially with temporal distance; in floating-point arithmetic they may even underflow to $0$. This is the vanishing gradient problem (see the numerical sketch after this answer).
(3) Vanishing gradients prevent the network from effectively updating, via gradient descent, the parameters that act at early time steps, making it hard for the model to learn and remember long-range dependencies; intuitively, its memory span is limited. This means an RNN processing long sequences may fail to capture important relationships between distant elements, hurting performance and generalization. Fortunately, LSTM and GRU alleviate the vanishing gradient problem by introducing gating mechanisms that replace the purely multiplicative gradient path with a linear combination.
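To make (2) concrete, here is a tiny numerical sketch (random weights and inputs, dimensions assumed) showing how the norm of the accumulated Jacobian $\prod_t \operatorname{diag}(1 - h_t^2)\, W_{hh}$ shrinks as the chain grows:

```python
import torch

torch.manual_seed(0)
d_h, T = 8, 50
# Recurrent weight rescaled so its largest singular value is 0.9 (< 1),
# which guarantees the Jacobian product contracts at every step.
W = torch.randn(d_h, d_h)
W_hh = 0.9 * W / torch.linalg.matrix_norm(W, ord=2)

J = torch.eye(d_h)                                # accumulated d h_t / d h_0
h = torch.zeros(d_h)
for t in range(1, T + 1):
    h = torch.tanh(W_hh @ h + torch.randn(d_h))   # random inputs stand in for W_xh x_t
    J = torch.diag(1 - h**2) @ W_hh @ J           # one more chain-rule factor
    if t % 10 == 0:
        print(f"t = {t:3d}, ||d h_t / d h_0|| = {torch.linalg.norm(J):.3e}")
```

Running this prints norms that fall by several orders of magnitude over 50 steps, the exponential decay described above.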
# 5
Using the attached dataset (input.txt), train a language model with an RNN built from GRU cells. Submit your model and the sampling results.
See Best_GRU_Model.pth and single_sample.txt in the attachments; a sketch of the setup follows.
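For reference, a minimal character-level GRU language model of this kind might look like the sketch below. The `CharGRU` name, hyperparameters, and sampling temperature are illustrative assumptions, not the submitted model's actual settings; the training loop (standard cross-entropy on next-character prediction) is omitted:

```python
import torch
import torch.nn as nn

class CharGRU(nn.Module):
    """Character-level language model: embedding -> GRU -> vocabulary logits."""
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=256, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, h=None):
        out, h = self.gru(self.embed(x), h)
        return self.fc(out), h

# Build a character vocabulary from the provided corpus.
text = open('input.txt', encoding='utf-8').read()
chars = sorted(set(text))
stoi = {c: i for i, c in enumerate(chars)}
itos = {i: c for c, i in stoi.items()}

model = CharGRU(vocab_size=len(chars))

@torch.no_grad()
def sample(model, prime='T', length=200, temperature=0.8):
    """Sample text by feeding the model's own predictions back in."""
    model.eval()
    h = None
    idx = torch.tensor([[stoi[prime]]])
    out_chars = [prime]
    for _ in range(length):
        logits, h = model(idx, h)
        probs = torch.softmax(logits[0, -1] / temperature, dim=-1)
        idx = torch.multinomial(probs, 1).view(1, 1)
        out_chars.append(itos[idx.item()])
    return ''.join(out_chars)

print(sample(model))  # untrained output is gibberish; train with cross-entropy first
```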
# 6
Given the fundamental attention equation:
$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}}\right)V$$
Consider the following input matrices, where $Q$, $K$, $V$, and $d_k$ are given.
- Compute the raw attention scores $QK^\top$ and show all steps.
- Apply the scaling factor $\frac{1}{\sqrt{d_k}}$ to the raw scores.
- Calculate the attention weights by applying the softmax function to each row of the scaled scores. Show your work for at least one row.
- Compute the final output by multiplying the attention weights with $V$.
- What is the shape of the final output matrix? Explain why this shape makes sense given the inputs.
(1) Compute the raw scores $QK^\top$: each entry is the dot product of one query row with one key row.
(2) Apply the scaling factor: divide every entry by $\sqrt{d_k}$ to obtain $\frac{QK^\top}{\sqrt{d_k}}$.
(3) The softmax of a row vector $z$ is
$$\mathrm{softmax}(z)_i = \frac{e^{z_i}}{\sum_j e^{z_j}}.$$
Applying softmax to the first row of the scaled scores normalizes its exponentials to sum to one; the remaining two rows are computed the same way, giving the attention weight matrix $A$.
(4) Compute the final output as $AV$.
(5) The final output matrix has the same number of rows as $Q$ and the same number of columns as $V$. This is reasonable: the query matrix $Q$ has one row per query vector, while the column count of the value matrix $V$ is the dimension of each value vector. The attention output should match the number of queries while preserving the dimensionality of the values, so this shape is exactly as expected.
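Since the original numeric matrices did not survive in this copy, here is a worked sketch with assumed $3 \times 2$ inputs (three queries, $d_k = d_v = 2$) that carries out the same four steps:

```python
import torch

# Assumed example values -- the problem's original matrices were not preserved.
Q = torch.tensor([[1., 0.], [0., 1.], [1., 1.]])   # 3 queries, d_k = 2
K = torch.tensor([[1., 0.], [0., 1.], [1., 1.]])
V = torch.tensor([[1., 2.], [3., 4.], [5., 6.]])   # d_v = 2
d_k = Q.shape[1]

scores = Q @ K.T                      # (1) raw attention scores, shape 3x3
scaled = scores / d_k**0.5            # (2) scale by 1/sqrt(d_k)
A = torch.softmax(scaled, dim=-1)     # (3) row-wise softmax -> attention weights
out = A @ V                           # (4) weighted sum of the value vectors

print(scores, scaled, A, out, sep='\n')
print(out.shape)                      # (5) torch.Size([3, 2]): #queries x d_v
```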
# 7
The multi-head attention mechanism is defined as:
$$\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head}_1, \dots, \mathrm{head}_h)\, W^O,$$
where each head is computed as:
$$\mathrm{head}_i = \mathrm{Attention}(Q W_i^Q,\; K W_i^K,\; V W_i^V).$$
Given:
- $X \in \mathbb{R}^{4 \times 8}$ (sequence length 4, embedding dimension 8),
- Number of heads $h$,
- $d_k = d_v = 8/h$.
- What must be the dimensions of the projection matrices $W_i^Q$, $W_i^K$, $W_i^V$, and $W^O$?
- If the output of the concatenated heads is $4 \times 8$, what operations are needed to ensure the final output has the same dimensions as the input?
- Write the complete mathematical expression for one head of the multi-head attention, showing all matrix dimensions.
(1) With $h$ heads, each head has dimension $d_k = d_v = 8/h$. The projection matrices therefore map each token embedding as follows:
- $W_i^Q$: projects the queries from dimension $8$ to dimension $d_k$, so $W_i^Q \in \mathbb{R}^{8 \times d_k}$;
- $W_i^K$: projects the keys from dimension $8$ to dimension $d_k$, so $W_i^K \in \mathbb{R}^{8 \times d_k}$;
- $W_i^V$: projects the values from dimension $8$ to dimension $d_v$, so $W_i^V \in \mathbb{R}^{8 \times d_v}$;
- $W^O$: projects the concatenated heads from dimension $h \cdot d_v = 8$ back to dimension $8$, so $W^O \in \mathbb{R}^{8 \times 8}$.
(2) The concatenated heads already form a $4 \times 8$ matrix, the same dimensions as the input $X$, so no extra reshaping is needed. Multiplying by the output projection matrix $W^O$ yields the final output while keeping the dimensions $4 \times 8$.
(3) For one head $i$, the complete expression (with the dimension bookkeeping) is
$$\mathrm{head}_i = \mathrm{softmax}\!\left(\frac{(X W_i^Q)(X W_i^K)^\top}{\sqrt{d_k}}\right)(X W_i^V),$$
where $X W_i^Q \in \mathbb{R}^{4 \times d_k}$, $X W_i^K \in \mathbb{R}^{4 \times d_k}$, the score matrix $(X W_i^Q)(X W_i^K)^\top \in \mathbb{R}^{4 \times 4}$, and $X W_i^V \in \mathbb{R}^{4 \times d_v}$, so $\mathrm{head}_i \in \mathbb{R}^{4 \times d_v}$.
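These shapes can be verified with a short sketch; $h = 2$ heads is an assumption here for concreteness, since the head count did not survive in this copy:

```python
import torch

seq_len, d_model, h = 4, 8, 2        # h = 2 is an assumed head count
d_k = d_v = d_model // h

X = torch.randn(seq_len, d_model)
W_Q = [torch.randn(d_model, d_k) for _ in range(h)]
W_K = [torch.randn(d_model, d_k) for _ in range(h)]
W_V = [torch.randn(d_model, d_v) for _ in range(h)]
W_O = torch.randn(h * d_v, d_model)

heads = []
for i in range(h):
    Qi, Ki, Vi = X @ W_Q[i], X @ W_K[i], X @ W_V[i]   # each 4 x d_k (or 4 x d_v)
    Ai = torch.softmax(Qi @ Ki.T / d_k**0.5, dim=-1)  # 4 x 4 attention weights
    heads.append(Ai @ Vi)                             # 4 x d_v per head

concat = torch.cat(heads, dim=-1)    # 4 x (h*d_v) = 4 x 8, same shape as X
out = concat @ W_O                   # output projection keeps 4 x 8
print(concat.shape, out.shape)       # torch.Size([4, 8]) torch.Size([4, 8])
```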
# 8
Consider the gradient flow through the attention mechanism. Let $L$ be the loss function.
- Using the chain rule, express $\frac{\partial L}{\partial Q}$ in terms of
  - the gradient with respect to the attention output, $\frac{\partial L}{\partial \mathrm{Output}}$,
  - the attention weights matrix $A$,
  - the value matrix $V$,
  - the key matrix $K$.
- The softmax derivative for a vector $s = \mathrm{softmax}(z)$ is $\frac{\partial s_i}{\partial z_j} = s_i(\delta_{ij} - s_j)$, where $\delta_{ij}$ is the Kronecker delta. How does this affect gradient computation in attention?
- Explain why the scaling factor $\frac{1}{\sqrt{d_k}}$ helps with gradient stability during training.
(1) Write the forward pass as $S = \frac{QK^\top}{\sqrt{d_k}}$, $A = \mathrm{softmax}(S)$ applied row-wise, and $\mathrm{Output} = AV$. By the chain rule,
$$\frac{\partial L}{\partial A} = \frac{\partial L}{\partial \mathrm{Output}}\, V^\top,$$
which is then passed through the row-wise softmax Jacobian,
$$\left(\frac{\partial L}{\partial S}\right)_{ij} = \sum_{k} \left(\frac{\partial L}{\partial A}\right)_{ik} A_{ik}\left(\delta_{kj} - A_{ij}\right),$$
and finally, since $S = QK^\top/\sqrt{d_k}$,
$$\frac{\partial L}{\partial Q} = \frac{1}{\sqrt{d_k}}\, \frac{\partial L}{\partial S}\, K$$
(verified numerically in the sketch after this answer).
(2) The form of the softmax derivative shows that the gradient of each score depends not only on the softmax value of the current element but also on the softmax values of all the other elements in its row. The attention weights therefore couple each other's gradients, which can make gradient computation unstable in some situations, especially with large-scale data or long sequences.
(3) The scaling factor $\frac{1}{\sqrt{d_k}}$ helps stabilize gradients because it reduces the magnitude of the dot products $QK^\top$ and prevents them from growing too large. Overly large dot products push the softmax output toward extreme values (close to $0$ or $1$), which makes the gradients very small and causes vanishing gradients. Scaling keeps the dot products in a reasonable range, so the softmax output stays better spread out, supporting stable gradient propagation and effective parameter updates.
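As a sanity check on (1), the following sketch (assumed small random matrices) compares the manual chain-rule gradient against PyTorch autograd:

```python
import torch

torch.manual_seed(0)
n, d_k, d_v = 3, 4, 5                 # assumed sizes
Q = torch.randn(n, d_k, requires_grad=True)
K = torch.randn(n, d_k)
V = torch.randn(n, d_v)

S = Q @ K.T / d_k**0.5                # scaled scores
A = torch.softmax(S, dim=-1)          # attention weights
out = A @ V
out.sum().backward()                  # for L = sum(out), dL/dOutput is all ones

# Manual chain rule: dL/dA = dL/dOutput @ V^T, then the row-wise softmax
# Jacobian, then dL/dQ = (1/sqrt(d_k)) * dL/dS @ K.
dOut = torch.ones(n, d_v)
dA = dOut @ V.T
dS = A * (dA - (dA * A).sum(dim=-1, keepdim=True))
dQ = dS @ K / d_k**0.5

print(torch.allclose(dQ, Q.grad))     # True
```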
