当前位置: 首页 > news >正文

DAY 45 超大力王爱学Python

来自超大力王的友情提示:在用tensorboard的时候一定一定要用绝对路径,例如:tensorboard --logdir="D:\代码\archive (1)\runs\cifar10_mlp_experiment_2",否则读取不了数据

知识点回顾:

  1. tensorboard的发展历史和原理
  2. tensorboard的常见操作
  3. tensorboard在cifar上的实战:MLP和CNN模型

效果展示如下,很适合拿去组会汇报撑页数:

作业:对resnet18在cifar10上采用微调策略下,用tensorboard监控训练过程。

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt
import os

# Set random seeds so results are reproducible across runs
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True

# 1. Data preprocessing: scale pixels to [0, 1], then normalize each RGB
#    channel to roughly [-1, 1] with mean/std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# 2. Load the CIFAR-10 dataset (downloaded to ./data on first run)
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    transform=transform,
)

# 3. Build the data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# CIFAR-10 class names (index matches the dataset's integer labels)
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 4. Define the MLP model
class MLP(nn.Module):
    """Three-layer MLP classifier for CIFAR-10.

    Flattens a (N, 3, 32, 32) image batch to (N, 3072) and maps it through
    two hidden layers (512, 256 units, ReLU + Dropout 0.2) to 10 raw logits.
    No softmax here — nn.CrossEntropyLoss applies log-softmax itself.
    """

    def __init__(self):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()        # (N, 3, 32, 32) -> (N, 3072)
        self.layer1 = nn.Linear(3072, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        self.layer2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.layer3 = nn.Linear(256, 10)   # output logits, one per class

    def forward(self, x):
        x = self.flatten(x)
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.layer3(x)
        return x
# Use the GPU when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Initialize model, loss, optimizer, and LR schedule (decay x0.1 every 5 epochs)
model = MLP()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Create the TensorBoard SummaryWriter
log_dir = 'runs/cifar10_mlp_experiment'
# Auto-generate a unique suffixed directory so earlier runs are not overwritten
if os.path.exists(log_dir):
    i = 1
    while os.path.exists(f"{log_dir}_{i}"):
        i += 1
    log_dir = f"{log_dir}_{i}"
# Create the log directory and verify it exists
os.makedirs(log_dir, exist_ok=True)
print(f"TensorBoard日志将保存在: {log_dir}")
if not os.path.exists(log_dir):
    raise FileNotFoundError(f"无法创建日志目录: {log_dir}")
writer = SummaryWriter(log_dir)

# Checkpoint paths: best-so-far and final model weights
model_save_dir = 'saved_models'
os.makedirs(model_save_dir, exist_ok=True)
best_model_path = os.path.join(model_save_dir, 'best_model.pth')
final_model_path = os.path.join(model_save_dir, 'final_model.pth')
# 5. Train the model (with TensorBoard logging)
def train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer):
    """Run the full train/eval loop with TensorBoard instrumentation.

    Logs the model graph, a sample-image grid, per-batch loss/accuracy/LR,
    per-epoch train & test metrics, and misclassified test samples; saves the
    best model (by test accuracy) to `best_model_path` and the final weights
    to `final_model_path` (both module-level paths).

    Returns the best test accuracy in percent, or 0.0 if the initial
    graph-logging step fails (training is aborted in that case).
    """
    model.train()
    best_accuracy = 0.0
    global_step = 0

    # Log the model graph (with error handling and debug prints)
    try:
        dataiter = iter(train_loader)
        images, labels = next(dataiter)
        images = images.to(device)
        writer.add_graph(model, images)
        print(f"✅ 已记录模型图至TensorBoard")
        print(f"成功获取训练数据批次,图像尺寸: {images.shape}")  # debug print
    except Exception as e:
        print(f"⚠️ 模型图记录失败: {e}")
        return 0.0  # abort early: if we can't fetch a batch, training can't proceed

    # Log a grid of raw training images (first 8 of the fetched batch)
    img_grid = torchvision.utils.make_grid(images[:8].cpu())
    writer.add_image('原始训练图像', img_grid, global_step=0)
    print(f"✅ 已记录原始图像至TensorBoard")

    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # Accumulate running loss and accuracy statistics
            running_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # Log every batch (keeps debugging simple; accuracy here is the
            # running average over the epoch so far, not this batch alone)
            writer.add_scalar('Train/Batch_Loss', loss.item(), global_step)
            writer.add_scalar('Train/Batch_Accuracy', 100. * correct / total, global_step)
            writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], global_step)

            # Print progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                batch_loss = loss.item()
                print(f'Epoch: {epoch+1}/{epochs} | Batch: {batch_idx+1}/{len(train_loader)} '
                      f'| Batch损失: {batch_loss:.4f} | 累计平均损失: {running_loss/(batch_idx+1):.4f}')
            global_step += 1

        # Step the learning-rate schedule once per epoch
        scheduler.step()

        # Epoch-level training metrics
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = 100. * correct / total
        writer.add_scalar('Train/Epoch_Loss', epoch_train_loss, epoch)
        writer.add_scalar('Train/Epoch_Accuracy', epoch_train_acc, epoch)

        # Evaluation phase
        model.eval()
        test_loss = 0
        correct_test = 0
        total_test = 0
        # Cap how many misclassified samples we collect per epoch
        max_wrong_samples = 100
        wrong_images = []
        wrong_labels = []
        wrong_preds = []
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()

                # Collect misclassified samples up to the cap
                wrong_mask = (predicted != target).cpu()
                if wrong_mask.sum() > 0 and len(wrong_images) < max_wrong_samples:
                    num_add = min(max_wrong_samples - len(wrong_images), wrong_mask.sum())
                    wrong_batch_images = data[wrong_mask][:num_add].cpu()
                    wrong_batch_labels = target[wrong_mask][:num_add].cpu()
                    wrong_batch_preds = predicted[wrong_mask][:num_add].cpu()
                    wrong_images.extend(wrong_batch_images)
                    wrong_labels.extend(wrong_batch_labels)
                    wrong_preds.extend(wrong_batch_preds)

        # Epoch-level test metrics
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test
        writer.add_scalar('Test/Loss', epoch_test_loss, epoch)
        writer.add_scalar('Test/Accuracy', epoch_test_acc, epoch)
        print(f'Epoch {epoch+1}/{epochs} 完成 | 训练准确率: {epoch_train_acc:.2f}% | 测试准确率: {epoch_test_acc:.2f}%')

        # Checkpoint the best model by test accuracy
        if epoch_test_acc > best_accuracy:
            best_accuracy = epoch_test_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"✅ 保存最佳模型(准确率: {best_accuracy:.2f}%)")

        # Log misclassified samples every epoch for inspection
        if len(wrong_images) > 0:
            display_count = min(8, len(wrong_images))
            wrong_img_grid = torchvision.utils.make_grid(wrong_images[:display_count])
            wrong_text = []
            for i in range(display_count):
                true_label = classes[wrong_labels[i]]
                pred_label = classes[wrong_preds[i]]
                wrong_text.append(f'True: {true_label}, Pred: {pred_label}')
            writer.add_image('错误预测样本', wrong_img_grid, global_step=epoch)
            writer.add_text('错误预测标签', '\n'.join(wrong_text), global_step=epoch)

        model.train()  # switch back to training mode for the next epoch

    # Save the final model weights
    torch.save(model.state_dict(), final_model_path)
    print(f"模型已保存至 {final_model_path}")

    # Flush and close the TensorBoard writer
    writer.flush()
    writer.close()
    return best_accuracy
# Windows compatibility for multiprocess DataLoader workers
if __name__ == "__main__":
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')  # key fix: required for num_workers > 0 on Windows
    epochs = 20
    print("开始训练模型...")
    print(f"TensorBoard日志保存在: {log_dir}")
    # BUG FIX: the original line was missing the f-prefix, so the literal
    # text "{log_dir}" was printed instead of the actual path.
    print(f"训练完成后,使用命令 `tensorboard --logdir={log_dir}` 启动TensorBoard")
    final_accuracy = train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer)
    print(f"训练完成!最佳测试准确率: {final_accuracy:.2f}%")
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt
import os

# Set random seeds so results are reproducible across runs
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True

# 1. Data preprocessing: scale pixels to [0, 1], then normalize each RGB
#    channel to roughly [-1, 1] with mean/std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# 2. Load the CIFAR-10 dataset (downloaded to ./data on first run)
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    transform=transform,
)

# 3. Build the data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# CIFAR-10 class names (index matches the dataset's integer labels)
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 4. Define the MLP model
class MLP(nn.Module):
    """Three-layer MLP classifier for CIFAR-10.

    Flattens a (N, 3, 32, 32) image batch to (N, 3072) and maps it through
    two hidden layers (512, 256 units, ReLU + Dropout 0.2) to 10 raw logits.
    No softmax here — nn.CrossEntropyLoss applies log-softmax itself.
    """

    def __init__(self):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()        # (N, 3, 32, 32) -> (N, 3072)
        self.layer1 = nn.Linear(3072, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        self.layer2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.layer3 = nn.Linear(256, 10)   # output logits, one per class

    def forward(self, x):
        x = self.flatten(x)
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.layer3(x)
        return x
# Use the GPU when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Initialize model, loss, optimizer, and LR schedule (decay x0.1 every 5 epochs)
model = MLP()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Create the TensorBoard SummaryWriter
log_dir = 'runs/cifar10_mlp_experiment'
# Auto-generate a unique suffixed directory so earlier runs are not overwritten
if os.path.exists(log_dir):
    i = 1
    while os.path.exists(f"{log_dir}_{i}"):
        i += 1
    log_dir = f"{log_dir}_{i}"
# Create the log directory and verify it exists
os.makedirs(log_dir, exist_ok=True)
print(f"TensorBoard日志将保存在: {log_dir}")
if not os.path.exists(log_dir):
    raise FileNotFoundError(f"无法创建日志目录: {log_dir}")
writer = SummaryWriter(log_dir)

# Checkpoint paths: best-so-far and final model weights
model_save_dir = 'saved_models'
os.makedirs(model_save_dir, exist_ok=True)
best_model_path = os.path.join(model_save_dir, 'best_model.pth')
final_model_path = os.path.join(model_save_dir, 'final_model.pth')
# 5. Train the model (with TensorBoard logging)
def train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer):
    """Run the full train/eval loop with TensorBoard instrumentation.

    Logs the model graph, a sample-image grid, per-batch loss/accuracy/LR,
    per-epoch train & test metrics, and misclassified test samples; saves the
    best model (by test accuracy) to `best_model_path` and the final weights
    to `final_model_path` (both module-level paths).

    Returns the best test accuracy in percent, or 0.0 if the initial
    graph-logging step fails (training is aborted in that case).
    """
    model.train()
    best_accuracy = 0.0
    global_step = 0

    # Log the model graph (with error handling and debug prints)
    try:
        dataiter = iter(train_loader)
        images, labels = next(dataiter)
        images = images.to(device)
        writer.add_graph(model, images)
        print(f"✅ 已记录模型图至TensorBoard")
        print(f"成功获取训练数据批次,图像尺寸: {images.shape}")  # debug print
    except Exception as e:
        print(f"⚠️ 模型图记录失败: {e}")
        return 0.0  # abort early: if we can't fetch a batch, training can't proceed

    # Log a grid of raw training images (first 8 of the fetched batch)
    img_grid = torchvision.utils.make_grid(images[:8].cpu())
    writer.add_image('原始训练图像', img_grid, global_step=0)
    print(f"✅ 已记录原始图像至TensorBoard")

    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # Accumulate running loss and accuracy statistics
            running_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # Log every batch (keeps debugging simple; accuracy here is the
            # running average over the epoch so far, not this batch alone)
            writer.add_scalar('Train/Batch_Loss', loss.item(), global_step)
            writer.add_scalar('Train/Batch_Accuracy', 100. * correct / total, global_step)
            writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], global_step)

            # Print progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                batch_loss = loss.item()
                print(f'Epoch: {epoch+1}/{epochs} | Batch: {batch_idx+1}/{len(train_loader)} '
                      f'| Batch损失: {batch_loss:.4f} | 累计平均损失: {running_loss/(batch_idx+1):.4f}')
            global_step += 1

        # Step the learning-rate schedule once per epoch
        scheduler.step()

        # Epoch-level training metrics
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = 100. * correct / total
        writer.add_scalar('Train/Epoch_Loss', epoch_train_loss, epoch)
        writer.add_scalar('Train/Epoch_Accuracy', epoch_train_acc, epoch)

        # Evaluation phase
        model.eval()
        test_loss = 0
        correct_test = 0
        total_test = 0
        # Cap how many misclassified samples we collect per epoch
        max_wrong_samples = 100
        wrong_images = []
        wrong_labels = []
        wrong_preds = []
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()

                # Collect misclassified samples up to the cap
                wrong_mask = (predicted != target).cpu()
                if wrong_mask.sum() > 0 and len(wrong_images) < max_wrong_samples:
                    num_add = min(max_wrong_samples - len(wrong_images), wrong_mask.sum())
                    wrong_batch_images = data[wrong_mask][:num_add].cpu()
                    wrong_batch_labels = target[wrong_mask][:num_add].cpu()
                    wrong_batch_preds = predicted[wrong_mask][:num_add].cpu()
                    wrong_images.extend(wrong_batch_images)
                    wrong_labels.extend(wrong_batch_labels)
                    wrong_preds.extend(wrong_batch_preds)

        # Epoch-level test metrics
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test
        writer.add_scalar('Test/Loss', epoch_test_loss, epoch)
        writer.add_scalar('Test/Accuracy', epoch_test_acc, epoch)
        print(f'Epoch {epoch+1}/{epochs} 完成 | 训练准确率: {epoch_train_acc:.2f}% | 测试准确率: {epoch_test_acc:.2f}%')

        # Checkpoint the best model by test accuracy
        if epoch_test_acc > best_accuracy:
            best_accuracy = epoch_test_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"✅ 保存最佳模型(准确率: {best_accuracy:.2f}%)")

        # Log misclassified samples every epoch for inspection
        if len(wrong_images) > 0:
            display_count = min(8, len(wrong_images))
            wrong_img_grid = torchvision.utils.make_grid(wrong_images[:display_count])
            wrong_text = []
            for i in range(display_count):
                true_label = classes[wrong_labels[i]]
                pred_label = classes[wrong_preds[i]]
                wrong_text.append(f'True: {true_label}, Pred: {pred_label}')
            writer.add_image('错误预测样本', wrong_img_grid, global_step=epoch)
            writer.add_text('错误预测标签', '\n'.join(wrong_text), global_step=epoch)

        model.train()  # switch back to training mode for the next epoch

    # Save the final model weights
    torch.save(model.state_dict(), final_model_path)
    print(f"模型已保存至 {final_model_path}")

    # Flush and close the TensorBoard writer
    writer.flush()
    writer.close()
    return best_accuracy
# Windows compatibility for multiprocess DataLoader workers
if __name__ == "__main__":
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')  # key fix: required for num_workers > 0 on Windows
    epochs = 20
    print("开始训练模型...")
    print(f"TensorBoard日志保存在: {log_dir}")
    # BUG FIX: the original line was missing the f-prefix, so the literal
    # text "{log_dir}" was printed instead of the actual path.
    print(f"训练完成后,使用命令 `tensorboard --logdir={log_dir}` 启动TensorBoard")
    final_accuracy = train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer)
    print(f"训练完成!最佳测试准确率: {final_accuracy:.2f}%")

@浙大疏锦行

http://www.xdnf.cn/news/955423.html

相关文章:

  • JAVA实战开源项目:经方药食两用服务平台 (Vue+SpringBoot) 附源码
  • day030-Shell自动化编程-函数
  • Ubuntu Linux环境查看服务器资源,查询CPU,内存,环境变量等命令
  • 标注工具核心代码解析——load_image【canvas.py]
  • NXP S32K146 T-Box 携手 SD NAND(贴片式TF卡):驱动汽车智能革新的黄金组合
  • 深度解析:DDoS攻击及其防御策略
  • 【深度解析】以太坊中的挖矿难度调整机制:从动态调节到“冰河时代”的倒计时
  • FOPLP vs CoWoS
  • 二、ROS2完成Docker容器和宿主机通信,使用ros2 topic list看到,但是无法echo
  • 驭码 CodeRider 2.0 产品体验:智能研发的革新之旅
  • OceanBase 桌面版
  • 从零开始了解数据采集(二十八)——制造业数字孪生
  • Hive 存储格式深度解析:从 TextFile 到 ORC,如何选对数据存储方案?
  • 基于FPGA的PID算法学习———实现P比例控制算法
  • 协同过滤算法进行电影推荐
  • java教程笔记(十三)-synchronized和ReentrantLock
  • Hilt vs Dagger2:Android 依赖注入框架对比
  • 2025年css+html面试题
  • PH热榜 | 2025-06-07
  • 有限自动机到正规文法转换器v1.0
  • 土建施工员考试:建筑施工技术重点知识有哪些?
  • Java中方法调用参数传递机制的理解和示例验证
  • NFT模式:数字资产确权与链游经济系统构建
  • 什么是VR全景展示?VR全景展示的用途
  • 微信小程序云开发平台MySQL的连接方式
  • STA(Station)模式与AP(Access Point)模式
  • LeetCode - 260. 只出现一次的数字 III
  • 镜像里切换为普通用户
  • VBA高级应用30例应用4:利用键盘设置来阻止自动运行事件
  • 研读论文《Attention Is All You Need》(14)