当前位置：首页 > web >正文

# YOLOv3：基于 PyTorch 的目标检测模型实现

web 2025/5/14 8:13:41

YOLOv3：基于 PyTorch 的目标检测模型实现

引言

YOLOv3（You Only Look Once）是一种流行的单阶段目标检测算法，它能够直接在输入图像上预测边界框和类别概率。YOLOv3 的优势在于其高效性和准确性，使其在实时目标检测任务中表现出色。本文将详细介绍如何使用 PyTorch 实现 YOLOv3 模型，并提供完整的代码实现。

1. YOLOv3 简介

YOLOv3 是 YOLO 系列算法的第三个版本，它在前两个版本的基础上进行了改进，提高了检测的准确性和速度。YOLOv3 的主要特点包括：

- 单阶段检测：YOLOv3 直接在输入图像上预测边界框和类别概率，无需生成候选框。
- 多尺度检测：YOLOv3 使用三个不同尺度的特征图进行检测，能够检测不同大小的目标。
- 高效率：YOLOv3 的设计使其能够在实时应用中高效运行。

2. 环境准备

在开始实现之前，确保你已经安装了以下必要的依赖库：

pip install torch numpy matplotlib

3. 代码实现

3.1 导入必要的库

from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from utils.parse_config import *  # 用于解析配置文件
from utils.utils import build_targets, to_cpu, non_max_suppression  # 用于目标构建和后处理

import matplotlib.pyplot as plt
import matplotlib.patches as patches

3.2 构建模块列表

create_modules 函数根据配置文件构建网络层：

def create_modules(module_defs):
    """Construct the module list of layer blocks described by ``module_defs``.

    Args:
        module_defs: List of layer-definition dicts. The first entry holds the
            network hyperparameters (``"channels"`` and ``"height"`` are read
            here). It is popped off the list, so after the call the caller's
            list contains only the layer definitions, aligned index-for-index
            with the returned module list.

    Returns:
        Tuple ``(hyperparams, module_list)``: the popped hyperparameter dict
        and an ``nn.ModuleList`` holding one ``nn.Sequential`` per remaining
        entry of ``module_defs``.
    """
    hyperparams = module_defs.pop(0)
    # output_filters[0] is the input channel count; entry k + 1 is the number
    # of output channels of layer k.
    output_filters = [int(hyperparams["channels"])]
    module_list = nn.ModuleList()

    for module_i, module_def in enumerate(module_defs):
        modules = nn.Sequential()
        layer_type = module_def["type"]

        if layer_type == "convolutional":
            bn = int(module_def["batch_normalize"])
            filters = int(module_def["filters"])
            kernel_size = int(module_def["size"])
            pad = (kernel_size - 1) // 2
            modules.add_module(
                f"conv_{module_i}",
                nn.Conv2d(
                    in_channels=output_filters[-1],
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=int(module_def["stride"]),
                    padding=pad,
                    # Batch norm supplies its own shift, so the conv bias is
                    # only created when there is no batch norm.
                    bias=not bn,
                ),
            )
            if bn:
                # PyTorch's ``momentum`` is the weight of the *new* batch
                # statistic (running = (1 - m) * running + m * batch), so a
                # slowly moving average needs 0.1 rather than 0.9.
                modules.add_module(
                    f"batch_norm_{module_i}",
                    nn.BatchNorm2d(filters, momentum=0.1),
                )
            if module_def["activation"] == "leaky":
                modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))

        elif layer_type == "maxpool":
            kernel_size = int(module_def["size"])
            stride = int(module_def["stride"])
            if kernel_size == 2 and stride == 1:
                # Pad right/bottom by one so the 2x2, stride-1 pooling keeps
                # the spatial size unchanged.
                modules.add_module(
                    f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1))
                )
            maxpool = nn.MaxPool2d(
                kernel_size=kernel_size,
                stride=stride,
                padding=int((kernel_size - 1) // 2),
            )
            modules.add_module(f"maxpool_{module_i}", maxpool)

        elif layer_type == "upsample":
            upsample = Upsample(
                scale_factor=int(module_def["stride"]), mode="nearest"
            )
            modules.add_module(f"upsample_{module_i}", upsample)

        elif layer_type == "route":
            # A route concatenates earlier layer outputs, so its channel count
            # is the sum of the referenced layers' channel counts.
            layers = [int(x) for x in module_def["layers"].split(",")]
            filters = sum(output_filters[1:][i] for i in layers)
            modules.add_module(f"route_{module_i}", EmptyLayer())

        elif layer_type == "shortcut":
            filters = output_filters[1:][int(module_def["from"])]
            modules.add_module(f"shortcut_{module_i}", EmptyLayer())

        elif layer_type == "yolo":
            # "mask" selects which of the (w, h) anchor pairs this layer uses.
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            anchors = [int(x) for x in module_def["anchors"].split(",")]
            anchors = [
                (anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)
            ]
            anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def["classes"])
            img_size = int(hyperparams["height"])
            yolo_layer = YOLOLayer(anchors, num_classes, img_size)
            modules.add_module(f"yolo_{module_i}", yolo_layer)

        module_list.append(modules)
        # Layers that do not set ``filters`` (maxpool, upsample, yolo) carry
        # the previous layer's channel count forward.
        output_filters.append(filters)

    return hyperparams, module_list

3.3 上采样层

Upsample 类实现上采样操作：

class Upsample(nn.Module):
    """Upsampling module that delegates to ``F.interpolate``."""

    def __init__(self, scale_factor, mode="nearest"):
        super().__init__()
        # Interpolation algorithm and size multiplier handed to F.interpolate.
        self.mode = mode
        self.scale_factor = scale_factor

    def forward(self, x):
        """Return ``x`` resized by ``scale_factor`` using ``mode``."""
        return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)

3.4 空层

EmptyLayer 类用于占位，例如在 route 和 shortcut 层中：

class EmptyLayer(nn.Module):
    """Parameter-free placeholder registered for 'route' and 'shortcut' layers."""

    def __init__(self):
        # No parameters or sub-modules are created; only base-class setup runs.
        super().__init__()

3.5 YOLO 检测层

YOLOLayer 类负责预测边界框、置信度和类别概率：

class YOLOLayer(nn.Module):
    """Detection layer.

    Decodes a raw feature map into box centres, sizes, an objectness score and
    per-class scores. When targets are supplied it also computes the layer
    loss and records training metrics in ``self.metrics``.
    """

    def __init__(self, anchors, num_classes, img_dim=416):
        """Store the anchors, class count and default input image size.

        Args:
            anchors: Sequence of ``(width, height)`` anchor pairs.
            num_classes: Number of class scores predicted per anchor.
            img_dim: Input image size used when ``forward`` receives none.
        """
        super().__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5  # passed to build_targets as ignore_thres
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()
        self.obj_scale = 1  # weight of the confidence loss on object cells
        self.noobj_scale = 100  # weight of the confidence loss on empty cells
        self.metrics = {}
        self.img_dim = img_dim
        self.grid_size = 0  # 0 means "offsets not computed yet"

    def compute_grid_offsets(self, grid_size, cuda=True):
        """Cache the cell offsets, stride and scaled anchors for ``grid_size``.

        Args:
            grid_size: Number of cells along one side of the feature map.
            cuda: Create the cached tensors on the current CUDA device when
                true, otherwise on the CPU.
        """
        self.grid_size = grid_size
        g = self.grid_size
        device = torch.device("cuda" if cuda else "cpu")
        # Number of input pixels covered by one grid cell.
        self.stride = self.img_dim / self.grid_size

        # grid_x[..., i, j] == j and grid_y[..., i, j] == i.
        cell_index = torch.arange(g, dtype=torch.float32, device=device)
        self.grid_x = cell_index.repeat(g, 1).view([1, 1, g, g])
        self.grid_y = cell_index.repeat(g, 1).t().view([1, 1, g, g])

        # Anchors expressed in grid cells instead of pixels.
        self.scaled_anchors = torch.tensor(
            [(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors],
            dtype=torch.float32,
            device=device,
        )
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))

    def forward(self, x, targets=None, img_dim=None):
        """Decode predictions and, when ``targets`` is given, compute the loss.

        Args:
            x: Feature map that can be viewed as
                ``(batch, num_anchors * (num_classes + 5), grid, grid)``.
            targets: Ground truth forwarded to ``build_targets``; ``None``
                selects inference mode.
            img_dim: Input image size. When ``None`` the size stored on the
                layer is kept.

        Returns:
            ``(output, 0)`` in inference mode, otherwise
            ``(output, total_loss)``. ``output`` has shape
            ``(batch, num_anchors * grid * grid, num_classes + 5)`` with the
            box values multiplied by the stride.
        """
        if img_dim is not None and img_dim != self.img_dim:
            # Only override the stored size when one is supplied, and drop the
            # cached offsets so stride and anchors are rebuilt for the new size.
            self.img_dim = img_dim
            self.grid_size = 0

        num_samples = x.size(0)
        grid_size = x.size(2)

        # (batch, anchors, grid, grid, 5 + classes)
        prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )

        pred_x = torch.sigmoid(prediction[..., 0])  # centre x within the cell
        pred_y = torch.sigmoid(prediction[..., 1])  # centre y within the cell
        pred_w = prediction[..., 2]  # raw width, exponentiated below
        pred_h = prediction[..., 3]  # raw height, exponentiated below
        pred_conf = torch.sigmoid(prediction[..., 4])
        pred_cls = torch.sigmoid(prediction[..., 5:])

        if grid_size != self.grid_size:
            self.compute_grid_offsets(grid_size, cuda=prediction.is_cuda)

        # Boxes in grid units, detached from the graph: cell offset plus the
        # in-cell centre, and anchor size scaled by exp(raw size).
        pred_boxes = torch.stack(
            (
                pred_x.data + self.grid_x,
                pred_y.data + self.grid_y,
                torch.exp(pred_w.data) * self.anchor_w,
                torch.exp(pred_h.data) * self.anchor_h,
            ),
            dim=-1,
        )

        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        if targets is None:
            return output, 0

        iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
            pred_boxes=pred_boxes,
            pred_cls=pred_cls,
            target=targets,
            anchors=self.scaled_anchors,
            ignore_thres=self.ignore_thres,
        )

        # Box and class losses are taken on object cells only; the confidence
        # loss is a weighted sum over object and no-object cells.
        loss_x = self.mse_loss(pred_x[obj_mask], tx[obj_mask])
        loss_y = self.mse_loss(pred_y[obj_mask], ty[obj_mask])
        loss_w = self.mse_loss(pred_w[obj_mask], tw[obj_mask])
        loss_h = self.mse_loss(pred_h[obj_mask], th[obj_mask])
        loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
        loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
        loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
        loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
        total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

        # Metrics
        cls_acc = 100 * class_mask[obj_mask].mean()
        conf_obj = pred_conf[obj_mask].mean()
        conf_noobj = pred_conf[noobj_mask].mean()
        conf50 = (pred_conf > 0.5).float()
        iou50 = (iou_scores > 0.5).float()
        iou75 = (iou_scores > 0.75).float()
        detected_mask = conf50 * class_mask * tconf
        # The 1e-16 terms guard against division by zero.
        precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
        recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
        recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

        self.metrics = {
            "loss": to_cpu(total_loss).item(),
            "x": to_cpu(loss_x).item(),
            "y": to_cpu(loss_y).item(),
            "w": to_cpu(loss_w).item(),
            "h": to_cpu(loss_h).item(),
            "conf": to_cpu(loss_conf).item(),
            "cls": to_cpu(loss_cls).item(),
            "cls_acc": to_cpu(cls_acc).item(),
            "recall50": to_cpu(recall50).item(),
            "recall75": to_cpu(recall75).item(),
            "precision": to_cpu(precision).item(),
            "conf_obj": to_cpu(conf_obj).item(),
            "conf_noobj": to_cpu(conf_noobj).item(),
            "grid_size": grid_size,
        }

        return output, total_loss

3.6 YOLOv3 模型

Darknet 类是 YOLOv3 模型的主体，负责加载配置文件、构建网络、前向传播、加载和保存权重：

class Darknet(nn.Module):
    """YOLOv3 object detection model.

    Builds the network from a configuration file, runs the forward pass and
    reads/writes weights in the Darknet binary layout (five ``int32`` header
    values followed by ``float32`` parameters).
    """

    def __init__(self, config_path, img_size=416):
        """Parse ``config_path`` and build the layers it describes.

        Args:
            config_path: Path handed to ``parse_model_config``.
            img_size: Stored as ``self.img_size``.
        """
        super().__init__()
        self.module_defs = parse_model_config(config_path)
        # create_modules pops the hyperparameter entry, leaving module_defs
        # aligned index-for-index with module_list.
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        # Detection layers are recognised by their ``metrics`` attribute.
        self.yolo_layers = [
            layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")
        ]
        self.img_size = img_size
        self.seen = 0  # written to / read from header slot 3 of weight files
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        """Run the network on ``x``.

        Args:
            x: Input batch; ``x.shape[2]`` is passed to the detection layers as
                the image size.
            targets: Optional ground truth forwarded to every detection layer.

        Returns:
            The concatenated detection-layer outputs (moved with ``to_cpu``)
            when ``targets`` is ``None``, otherwise ``(loss, outputs)`` where
            ``loss`` is the sum of the detection-layer losses.
        """
        img_dim = x.shape[2]
        loss = 0
        layer_outputs, yolo_outputs = [], []
        for module_def, module in zip(self.module_defs, self.module_list):
            layer_type = module_def["type"]
            if layer_type in ["convolutional", "upsample", "maxpool"]:
                x = module(x)
            elif layer_type == "route":
                # Concatenate the referenced earlier outputs along channels.
                x = torch.cat(
                    [layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")],
                    1,
                )
            elif layer_type == "shortcut":
                layer_i = int(module_def["from"])
                x = layer_outputs[-1] + layer_outputs[layer_i]
            elif layer_type == "yolo":
                x, layer_loss = module[0](x, targets, img_dim)
                loss += layer_loss
                yolo_outputs.append(x)
            layer_outputs.append(x)
        yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
        return yolo_outputs if targets is None else (loss, yolo_outputs)

    @staticmethod
    def _copy_from_weights(tensor, weights, ptr):
        """Fill ``tensor`` from ``weights[ptr:]`` and return the advanced pointer."""
        count = tensor.numel()
        values = torch.from_numpy(weights[ptr : ptr + count]).view_as(tensor)
        tensor.data.copy_(values)
        return ptr + count

    def load_darknet_weights(self, weights_path):
        """Load weights stored in the Darknet binary layout.

        Args:
            weights_path: Path of the weights file. When it contains
                ``"darknet53.conv.74"`` only the first 75 layers are loaded.
        """
        with open(weights_path, "rb") as f:
            header = np.fromfile(f, dtype=np.int32, count=5)
            self.header_info = header
            self.seen = header[3]
            weights = np.fromfile(f, dtype=np.float32)

        cutoff = None
        if "darknet53.conv.74" in weights_path:
            cutoff = 75

        ptr = 0
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if i == cutoff:
                break
            if module_def["type"] != "convolutional":
                continue
            conv_layer = module[0]
            # int() matches create_modules: the flag may be the string "0",
            # which a plain truthiness test would treat as enabled.
            if int(module_def["batch_normalize"]):
                bn_layer = module[1]
                # File order: bias, weight, running mean, running variance.
                ptr = self._copy_from_weights(bn_layer.bias, weights, ptr)
                ptr = self._copy_from_weights(bn_layer.weight, weights, ptr)
                ptr = self._copy_from_weights(bn_layer.running_mean, weights, ptr)
                ptr = self._copy_from_weights(bn_layer.running_var, weights, ptr)
            else:
                ptr = self._copy_from_weights(conv_layer.bias, weights, ptr)
            ptr = self._copy_from_weights(conv_layer.weight, weights, ptr)

    def save_darknet_weights(self, path, cutoff=-1):
        """Write the weights in the Darknet binary layout.

        Args:
            path: Destination file path.
            cutoff: Slice end applied to the layer lists; only layers before
                it are saved (``-1`` leaves out the last layer).
        """
        # The context manager closes the file even if a write fails.
        with open(path, "wb") as fp:
            self.header_info[3] = self.seen
            self.header_info.tofile(fp)

            layers = zip(self.module_defs[:cutoff], self.module_list[:cutoff])
            for module_def, module in layers:
                if module_def["type"] != "convolutional":
                    continue
                conv_layer = module[0]
                # Same int() conversion as in load_darknet_weights.
                if int(module_def["batch_normalize"]):
                    bn_layer = module[1]
                    bn_layer.bias.data.cpu().numpy().tofile(fp)
                    bn_layer.weight.data.cpu().numpy().tofile(fp)
                    bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                    bn_layer.running_var.data.cpu().numpy().tofile(fp)
                else:
                    conv_layer.bias.data.cpu().numpy().tofile(fp)
                conv_layer.weight.data.cpu().numpy().tofile(fp)

4. 使用示例

以下是一个简单的示例代码，用于加载 YOLOv3 模型并进行推理：

import torch
from models import *  # make sure your model definitions live in the models module

# Locations of the configuration file and the weights
config_path = "path/to/your/yolov3.cfg"
weights_path = "path/to/your/yolov3.weights"
img_size = 416

model = Darknet(config_path, img_size=img_size)
model.load_darknet_weights(weights_path)

# Switch to evaluation mode
model.eval()

# Input image: a random stand-in tensor of shape (1, 3, 416, 416)
input_image = torch.randn(1, 3, img_size, img_size)

# Forward pass without gradient tracking
with torch.no_grad():
    detections = model(input_image)

# Print the detections
print(detections)