第J2周：ResNet50V2 算法实战与解析

🍨 本文为🔗365天深度学习训练营中的学习记录博客
🍖 原作者：K同学啊

学习目标

✅ 根据TensorFlow代码，编写出相应的PyTorch代码
✅ 了解ResNetV2和ResNet模型的区别

一、环境配置

在这里插入图片描述

二、数据预处理

在这里插入图片描述

三、创建、划分数据集

在这里插入图片描述

四、创建数据加载器

在这里插入图片描述

五、加载预训练模型

在这里插入图片描述

六、显示训练数据

# Show a grid of sample images from a data loader
def show_images(loader, title="数据示例", class_names=None):
    """Display up to 12 images from the first batch of ``loader`` in a 3x4 grid.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches. Each image is
            converted with ``.cpu().numpy().transpose((1, 2, 0))``, so it is
            presumably a CHW tensor normalized with the mean/std constants
            below -- TODO confirm against the transform pipeline.
        title: Text used as the figure's super-title.
        class_names: Optional sequence mapping a label index to its display
            name. When omitted, the module-level ``dataset.classes`` is used,
            which matches the original behavior.

    Returns:
        None. Any failure is reported on stdout and the figure is closed
        instead of propagating the exception.
    """
    plt.figure(figsize=(12, 8))
    plt.suptitle(title, fontsize=16)

    # De-normalization constants; identical for every image, so build them once.
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])

    try:
        # Only the first batch is needed; a sentinel makes the empty-loader
        # case explicit instead of surfacing as an unbound-variable error.
        first_batch = next(iter(loader), None)
        if first_batch is None:
            print("无法显示图像示例: 数据加载器为空")
            plt.close()
            return

        images, labels = first_batch
        images = images[:12]
        labels = labels[:12]

        if class_names is None:
            class_names = dataset.classes

        for i, (image, label) in enumerate(zip(images, labels)):
            ax = plt.subplot(3, 4, i + 1)
            # Move the channel axis last, which is the layout imshow expects.
            img = image.cpu().numpy().transpose((1, 2, 0))
            # Undo the normalization and clamp to the displayable [0, 1] range.
            img = std * img + mean
            img = np.clip(img, 0, 1)
            ax.imshow(img)
            ax.set_title(f"{class_names[int(label)]}", fontsize=10)
            ax.axis("off")

        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best-effort visualization: report the problem and discard the figure.
        print(f"无法显示图像示例: {e}")
        plt.close()

# Show sample images drawn from the training loader
print("\n显示训练数据示例...")
show_images(train_loader, "训练数据示例（无数据增强）")

七、编写训练函数、验证函数、设置早停机制

# Training function
def train_epoch(model, device, train_loader, optimizer, criterion):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module to train; switched to training mode here.
        device: Device the batches are moved to before the forward pass.
        train_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer whose gradients are zeroed and stepped per batch.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        ``(epoch_loss, epoch_acc)``. ``epoch_loss`` is the sample-weighted mean
        loss as a float, or ``inf`` when no batch was processed. ``epoch_acc``
        is always a 0-dim double tensor (zero when no batch was processed), so
        callers can rely on ``.item()``.

    A batch that raises is reported on stdout and skipped; it contributes
    nothing to the returned statistics.
    """
    model.train()
    running_loss = 0.0
    # Start from a tensor rather than the int 0 so the returned accuracy has
    # the same type whether or not any batch succeeds.
    running_corrects = torch.zeros((), dtype=torch.long, device=device)
    total = 0

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        try:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            _, preds = torch.max(outputs, 1)
            # Weight by batch size so a smaller final batch is averaged fairly.
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels)
            total += labels.size(0)

            if batch_idx % 10 == 0:
                print(f'Batch: {batch_idx}/{len(train_loader)}, Loss: {loss.item():.4f}')

            # Release GPU memory held by this batch.
            # NOTE(review): emptying the cache every batch is kept from the
            # original; confirm it is needed, as it may slow training.
            del inputs, labels, outputs, loss
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            # Deliberate best-effort: report the failing batch and move on.
            print(f"训练批次 {batch_idx} 出现错误: {e}")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            continue

    if total > 0:
        epoch_loss = running_loss / total
        epoch_acc = running_corrects.double() / total
    else:
        epoch_loss = float('inf')
        epoch_acc = running_corrects.double()

    return epoch_loss, epoch_acc

# Validation function
def validate(model, device, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Args:
        model: Module to evaluate; switched to evaluation mode here.
        device: Device the batches are moved to before the forward pass.
        val_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        ``(epoch_loss, epoch_acc)``. ``epoch_loss`` is the sample-weighted mean
        loss as a float, or ``inf`` when no batch was processed. ``epoch_acc``
        is always a 0-dim double tensor (zero when no batch was processed), so
        callers can rely on ``.item()``.

    A batch that raises is reported on stdout and skipped; it contributes
    nothing to the returned statistics.
    """
    model.eval()
    running_loss = 0.0
    # Start from a tensor rather than the int 0 so the returned accuracy has
    # the same type whether or not any batch succeeds.
    running_corrects = torch.zeros((), dtype=torch.long, device=device)
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            try:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                _, preds = torch.max(outputs, 1)
                # Weight by batch size so a smaller final batch is averaged fairly.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)
                total += labels.size(0)

                # Release memory held by this batch.
                del inputs, labels, outputs, loss
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except Exception as e:
                # Deliberate best-effort: report the failing batch and move on.
                print(f"验证过程中出现错误: {e}")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                continue

    if total > 0:
        epoch_loss = running_loss / total
        epoch_acc = running_corrects.double() / total
    else:
        epoch_loss = float('inf')
        epoch_acc = running_corrects.double()

    return epoch_loss, epoch_acc

# Early-stopping helper
class EarlyStopping:
    """Stop training when validation loss stops improving by at least ``delta``.

    Call the instance once per epoch with the validation loss and the model.
    The model's ``state_dict`` is written to ``path`` on the first call and
    whenever the loss improves on the best seen so far by at least ``delta``.
    After ``patience`` consecutive calls without such an improvement,
    ``early_stop`` becomes ``True``.
    """

    def __init__(self, patience=7, verbose=False, delta=0.001, path='best_resnet50v2.pth'):
        """Configure the stopper.

        Args:
            patience: Number of consecutive non-improving calls tolerated.
            verbose: When true, print counter and checkpoint messages.
            delta: Minimum decrease in loss that counts as an improvement.
            path: File the best ``state_dict`` is saved to.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # float("inf") instead of np.Inf: that alias was removed in NumPy 2.0.
        self.val_loss_min = float("inf")
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Update the stopper with this epoch's validation loss."""
        # Negate so that a larger score means a better (lower) loss.
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best by at least delta: count a strike.
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save ``model.state_dict()`` to ``self.path`` and record ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

八、开始训练

# Start training
print("\n开始训练...")
num_epochs = 25
early_stopping = EarlyStopping(patience=5, verbose=True)

# Per-epoch history; an entry is added to all five lists together, so they
# always have the same length and can be plotted against each other.
train_losses = []
train_accs = []
val_losses = []
val_accs = []
lr_history = []

for epoch in range(num_epochs):
    print(f'\nEpoch {epoch+1}/{num_epochs}')
    print('-' * 10)

    try:
        # Learning rate in effect for this epoch (read before scheduler.step()).
        current_lr = optimizer.param_groups[0]['lr']

        # Training phase
        train_loss, train_acc = train_epoch(model, device, train_loader, optimizer, criterion)

        # Validation phase
        val_loss, val_acc = validate(model, device, val_loader, criterion)

        # float() accepts both 0-dim tensors and plain numbers, so a degenerate
        # epoch that yields a non-tensor accuracy does not raise here.
        train_acc_value = float(train_acc)
        val_acc_value = float(val_acc)

        # Record the epoch only after both phases succeeded; appending earlier
        # would leave the lists misaligned if a later step raised.
        lr_history.append(current_lr)
        train_losses.append(train_loss)
        train_accs.append(train_acc_value)
        val_losses.append(val_loss)
        val_accs.append(val_acc_value)

        # Advance the learning-rate schedule
        scheduler.step()

        # Report results
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc_value*100:.2f}%')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc_value*100:.2f}%')
        print(f'Learning Rate: {current_lr:.6f}')

        # Early-stopping check (also saves a checkpoint when the loss improves)
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    except Exception as e:
        # Best-effort: report the failed epoch, free cached GPU memory, go on.
        print(f"Epoch {epoch+1} 出现错误: {e}")
        print("尝试清理内存并继续...")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        continue

print("\n训练完成!")

开始训练...

Epoch 1/25
----------
Batch: 0/28, Loss: 1.3802
Batch: 10/28, Loss: 1.0213
Batch: 20/28, Loss: 0.4903
Train Loss: 0.8478, Train Acc: 85.49%
Val Loss: 0.1881, Val Acc: 99.12%
Learning Rate: 0.000100
Validation loss decreased (inf --> 0.188087).  Saving model ...

Epoch 2/25
----------
Batch: 0/28, Loss: 0.2666
Batch: 10/28, Loss: 0.0800
Batch: 20/28, Loss: 0.0263
Train Loss: 0.0922, Train Acc: 99.55%
Val Loss: 0.0231, Val Acc: 100.00%
Learning Rate: 0.000100
Validation loss decreased (0.188087 --> 0.023144).  Saving model ...

Epoch 3/25
----------
Batch: 0/28, Loss: 0.0139
Batch: 10/28, Loss: 0.0292
Batch: 20/28, Loss: 0.0233
Train Loss: 0.0256, Train Acc: 100.00%
Val Loss: 0.0150, Val Acc: 100.00%
Learning Rate: 0.000100
Validation loss decreased (0.023144 --> 0.015029).  Saving model ...

Epoch 4/25
----------
Batch: 0/28, Loss: 0.0086
Batch: 10/28, Loss: 0.0055
Batch: 20/28, Loss: 0.0183
Train Loss: 0.0079, Train Acc: 100.00%
Val Loss: 0.0113, Val Acc: 100.00%
Learning Rate: 0.000100
Validation loss decreased (0.015029 --> 0.011310).  Saving model ...

Epoch 5/25
----------
Batch: 0/28, Loss: 0.0280
Batch: 10/28, Loss: 0.0064
Batch: 20/28, Loss: 0.0134
Train Loss: 0.0059, Train Acc: 100.00%
Val Loss: 0.0104, Val Acc: 100.00%
Learning Rate: 0.000100
EarlyStopping counter: 1 out of 5

Epoch 6/25
----------
Batch: 0/28, Loss: 0.0039
Batch: 10/28, Loss: 0.0042
Batch: 20/28, Loss: 0.0041
Train Loss: 0.0209, Train Acc: 99.78%
Val Loss: 0.0122, Val Acc: 100.00%
Learning Rate: 0.000100
EarlyStopping counter: 2 out of 5

Epoch 7/25
----------
Batch: 0/28, Loss: 0.0015
Batch: 10/28, Loss: 0.0026
Batch: 20/28, Loss: 0.0007
Train Loss: 0.0171, Train Acc: 99.78%
Val Loss: 0.0115, Val Acc: 100.00%
Learning Rate: 0.000100
EarlyStopping counter: 3 out of 5

Epoch 8/25
----------
Batch: 0/28, Loss: 0.0041
Batch: 10/28, Loss: 0.0077
Batch: 20/28, Loss: 0.0032
Train Loss: 0.0071, Train Acc: 100.00%
Val Loss: 0.0112, Val Acc: 100.00%
Learning Rate: 0.000050
EarlyStopping counter: 4 out of 5

Epoch 9/25
----------
Batch: 0/28, Loss: 0.0051
Batch: 10/28, Loss: 0.0014
Batch: 20/28, Loss: 0.0037
Train Loss: 0.0026, Train Acc: 100.00%
Val Loss: 0.0091, Val Acc: 100.00%
Learning Rate: 0.000050
Validation loss decreased (0.011310 --> 0.009129).  Saving model ...

Epoch 10/25
----------
Batch: 0/28, Loss: 0.0164
Batch: 10/28, Loss: 0.0057
Batch: 20/28, Loss: 0.0015
Train Loss: 0.0058, Train Acc: 100.00%
Val Loss: 0.0080, Val Acc: 100.00%
Learning Rate: 0.000050
Validation loss decreased (0.009129 --> 0.008041).  Saving model ...

Epoch 11/25
----------
Batch: 0/28, Loss: 0.0019
Batch: 10/28, Loss: 0.0017
Batch: 20/28, Loss: 0.0009
Train Loss: 0.0197, Train Acc: 99.78%
Val Loss: 0.0116, Val Acc: 100.00%
Learning Rate: 0.000050
EarlyStopping counter: 1 out of 5

Epoch 12/25
----------
Batch: 0/28, Loss: 0.0011
Batch: 10/28, Loss: 0.0018
Batch: 20/28, Loss: 0.0016
Train Loss: 0.0030, Train Acc: 100.00%
Val Loss: 0.0123, Val Acc: 100.00%
Learning Rate: 0.000050
EarlyStopping counter: 2 out of 5

Epoch 13/25
----------
Batch: 0/28, Loss: 0.0019
Batch: 10/28, Loss: 0.0079
Batch: 20/28, Loss: 0.0068
Train Loss: 0.0040, Train Acc: 100.00%
Val Loss: 0.0085, Val Acc: 100.00%
Learning Rate: 0.000050
EarlyStopping counter: 3 out of 5

Epoch 14/25
----------
Batch: 0/28, Loss: 0.0009
Batch: 10/28, Loss: 0.0091
Batch: 20/28, Loss: 0.0007
Train Loss: 0.0052, Train Acc: 99.78%
Val Loss: 0.0106, Val Acc: 100.00%
Learning Rate: 0.000050
EarlyStopping counter: 4 out of 5

Epoch 15/25
----------
Batch: 0/28, Loss: 0.0020
Batch: 10/28, Loss: 0.0004
Batch: 20/28, Loss: 0.0061
Train Loss: 0.0026, Train Acc: 100.00%
Val Loss: 0.0094, Val Acc: 100.00%
Learning Rate: 0.000025
EarlyStopping counter: 5 out of 5
Early stopping

训练完成!

九、评估

# Plot the training curves on a 2x2 grid
plt.figure(figsize=(15, 10))

# Panel 1: training and validation loss per recorded epoch
plt.subplot(2, 2, 1)
for loss_series, loss_label, loss_marker in (
    (train_losses, 'Train Loss', 'o'),
    (val_losses, 'Val Loss', 's'),
):
    plt.plot(
        range(1, len(loss_series) + 1),
        loss_series,
        label=loss_label,
        marker=loss_marker,
    )
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)

# Panel 2: training and validation accuracy, shown as percentages
plt.subplot(2, 2, 2)
for acc_series, acc_label, acc_marker in (
    (train_accs, 'Train Acc', 'o'),
    (val_accs, 'Val Acc', 's'),
):
    plt.plot(
        range(1, len(acc_series) + 1),
        [fraction * 100 for fraction in acc_series],
        label=acc_label,
        marker=acc_marker,
    )
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.grid(True)

# Panel 3: learning rate per recorded epoch, on a logarithmic y-axis
plt.subplot(2, 2, 3)
lr_epochs = range(1, len(lr_history) + 1)
plt.plot(lr_epochs, lr_history, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Learning Rate')
plt.title('Learning Rate Schedule')
plt.yscale('log')
plt.grid(True)

# Panel 4: selected here; whatever is drawn on it is not part of this excerpt
plt.subplot(2, 2, 4)

在这里插入图片描述

十、总结

ResNet v2的主要改进点包括：1. 使用预激活（pre-activation）结构，把BatchNorm和ReLU放在卷积之前。这样在残差块中，归一化和激活函数先于卷积层应用，梯度流动更顺畅，能更有效地缓解梯度消失的问题。2. 调整了残差路径的设计：在跳跃连接中避免不必要的操作，保持恒等映射。3. 在上述改动的基础上，可以稳定地训练更深的网络结构。

在架构变化方面，ResNet v1的残差块是conv-BN-ReLU的序列，而v2调整为BN-ReLU-conv的顺序，也就是预激活。这样的调整使训练更稳定、准确率更高。此外，v2还探讨了不同的残差块设计，比如在跳跃连接中加入额外的卷积等操作；结论是应当简化这部分，保持跳跃连接尽可能简单，以保留恒等映射的特性。

另外，ResNet v2的论文在CIFAR-10、ImageNet等数据集上进行了实验验证，证明了其有效性：更深的网络（如约1000层）也能够稳定训练，并且准确率有所提升。

综上，ResNet v1和v2的主要区别可以明确归纳为：预激活结构、恒等映射的改进、更优的梯度流动等。