......@@ -25,20 +25,20 @@ video.py里面测试的FPS会低于该FPS,因为摄像头的读取频率有限
def get_FPS(self, image, test_interval):
# 调整图片使其符合输入要求
image_shape = np.array(np.shape(image)[0:2])
# 给图像增加灰条,实现不失真的resize
crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
photo = np.array(crop_img,dtype = np.float32)
photo /= 255.0
photo = np.array(crop_img,dtype = np.float32) / 255.0
photo = np.transpose(photo, (2, 0, 1))
photo = photo.astype(np.float32)
images = []
images = np.asarray(images)
# 添加上batch_size维度
images = [photo]
with torch.no_grad():
images = torch.from_numpy(images)
images = torch.from_numpy(np.asarray(images))
if self.cuda:
images = images.cuda()
outputs = self.net(images)
# 验证集的划分在train.py代码里面进行
# test.txt和val.txt里面没有内容是正常的。训练不会使用到。
import os
import random
# 想要增加测试集修改trainval_percent
# train_percent不需要修改
# mAP所需文件计算代码
# 具体教程请查看Bilibili
# Bubbliiiing
import cv2
import numpy as np
# 获取测试集的detection-result和images-optional
# 具体视频教程可查看
# https://www.bilibili.com/video/BV1zE411u7Vw
import colorsys
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.nn as nn
from PIL import Image, ImageDraw, ImageFont
from torch.autograd import Variable
from yolo import YOLO
from tqdm import tqdm
from nets.yolo3 import YoloBody
from PIL import Image,ImageFont, ImageDraw
from utils.config import Config
from utils.utils import non_max_suppression, bbox_iou, DecodeBox,letterbox_image,yolo_correct_boxes
from tqdm import tqdm
from utils.utils import (DecodeBox, bbox_iou, letterbox_image,
non_max_suppression, yolo_correct_boxes)
from yolo import YOLO
class mAP_Yolo(YOLO):
......@@ -28,40 +32,61 @@ class mAP_Yolo(YOLO):
f = open("./input/detection-results/"+image_id+".txt","w")
image_shape = np.array(np.shape(image)[0:2])
# 给图像增加灰条,实现不失真的resize
crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
photo = np.array(crop_img,dtype = np.float32)
photo /= 255.0
photo = np.array(crop_img,dtype = np.float32) / 255.0
photo = np.transpose(photo, (2, 0, 1))
photo = photo.astype(np.float32)
images = []
images = np.asarray(images)
images = torch.from_numpy(images)
if self.cuda:
images = images.cuda()
# 添加上batch_size维度
images = [photo]
with torch.no_grad():
images = torch.from_numpy(np.asarray(images))
if self.cuda:
images = images.cuda()
# 将图像输入网络当中进行预测!
outputs = self.net(images)
output_list = []
for i in range(3):
# 将预测框进行堆叠,然后进行非极大抑制
output = torch.cat(output_list, 1)
batch_detections = non_max_suppression(output, self.config["yolo"]["classes"],
try :
batch_detections = batch_detections[0].cpu().numpy()
return image
top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence
top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
top_label = np.array(batch_detections[top_index,-1],np.int32)
top_bboxes = np.array(batch_detections[top_index,:4])
top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
# 去掉灰条
boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
# 如果没有检测出物体,返回原图
try :
batch_detections = batch_detections[0].cpu().numpy()
return image
# 对预测框进行得分筛选
top_index = batch_detections[:,4] * batch_detections[:,5] > self.confidence
top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
top_label = np.array(batch_detections[top_index,-1],np.int32)
top_bboxes = np.array(batch_detections[top_index,:4])
top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
# 在图像传入网络预测前会进行letterbox_image给图像周围添加灰条
# 因此生成的top_bboxes是相对于有灰条的图像的
# 我们需要对其进行修改,去除灰条的部分。
boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
for i, c in enumerate(top_label):
predicted_class = self.class_names[c]
import torch
import torch.nn as nn
import math
from collections import OrderedDict
# 基本的darknet块
import torch
import torch.nn as nn
# 残差结构
# 利用一个1x1卷积下降通道数,然后利用一个3x3卷积提取特征并且上升通道数
# 最后接上一个残差边
class BasicBlock(nn.Module):
def __init__(self, inplanes, planes):
super(BasicBlock, self).__init__()
......@@ -36,14 +42,20 @@ class DarkNet(nn.Module):
def __init__(self, layers):
super(DarkNet, self).__init__()
self.inplanes = 32
# 416,416,3 -> 416,416,32
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(self.inplanes)
self.relu1 = nn.LeakyReLU(0.1)
# 416,416,32 -> 208,208,64
self.layer1 = self._make_layer([32, 64], layers[0])
# 208,208,64 -> 104,104,128
self.layer2 = self._make_layer([64, 128], layers[1])
# 104,104,128 -> 52,52,256
self.layer3 = self._make_layer([128, 256], layers[2])
# 52,52,256 -> 26,26,512
self.layer4 = self._make_layer([256, 512], layers[3])
# 26,26,512 -> 13,13,1024
self.layer5 = self._make_layer([512, 1024], layers[4])
self.layers_out_filters = [64, 128, 256, 512, 1024]
......@@ -57,6 +69,10 @@ class DarkNet(nn.Module):
# 在每一个layer里面,首先利用一个步长为2的3x3卷积进行下采样
# 然后进行残差结构的堆叠
def _make_layer(self, planes, blocks):
layers = []
# 下采样,步长为2,卷积核大小为3
......@@ -64,7 +80,7 @@ class DarkNet(nn.Module):
stride=2, padding=1, bias=False)))
layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
layers.append(("ds_relu", nn.LeakyReLU(0.1)))
# 加入darknet模块
# 加入残差结构
self.inplanes = planes[1]
for i in range(0, blocks):
layers.append(("residual_{}".format(i), BasicBlock(self.inplanes, planes)))
from collections import OrderedDict
import torch
import torch.nn as nn
from collections import OrderedDict
from nets.darknet import darknet53
def conv2d(filter_in, filter_out, kernel_size):
pad = (kernel_size - 1) // 2 if kernel_size else 0
return nn.Sequential(OrderedDict([
......@@ -11,6 +14,10 @@ def conv2d(filter_in, filter_out, kernel_size):
("relu", nn.LeakyReLU(0.1)),
# make_last_layers里面一共有七个卷积,前五个用于提取特征。
# 后两个用于获得yolo网络的预测结果
def make_last_layers(filters_list, in_filters, out_filter):
m = nn.ModuleList([
conv2d(in_filters, filters_list[0], 1),
......@@ -28,21 +35,30 @@ class YoloBody(nn.Module):
def __init__(self, config):
super(YoloBody, self).__init__()
self.config = config
# backbone
# 生成darknet53的主干模型
# Obtain three effective feature layers; their shapes are:
# 52,52,256
# 26,26,512
# 13,13,1024
self.backbone = darknet53(None)
# out_filters : [64, 128, 256, 512, 1024]
out_filters = self.backbone.layers_out_filters
# last_layer0
# 计算yolo_head的输出通道数,对于voc数据集而言
# final_out_filter0 = final_out_filter1 = final_out_filter2 = 75
final_out_filter0 = len(config["yolo"]["anchors"][0]) * (5 + config["yolo"]["classes"])
self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], final_out_filter0)
# embedding1
final_out_filter1 = len(config["yolo"]["anchors"][1]) * (5 + config["yolo"]["classes"])
self.last_layer1_conv = conv2d(512, 256, 1)
self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, final_out_filter1)
# embedding2
final_out_filter2 = len(config["yolo"]["anchors"][2]) * (5 + config["yolo"]["classes"])
self.last_layer2_conv = conv2d(256, 128, 1)
self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
......@@ -56,21 +72,43 @@ class YoloBody(nn.Module):
if i == 4:
out_branch = layer_in
return layer_in, out_branch
# backbone
# Obtain three effective feature layers; their shapes are:
# 52,52,256; 26,26,512; 13,13,1024
x2, x1, x0 = self.backbone(x)
# yolo branch 0
# 第一个特征层
# out0 = (batch_size,255,13,13)
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
out0, out0_branch = _branch(self.last_layer0, x0)
# yolo branch 1
# 13,13,512 -> 13,13,256 -> 26,26,256
x1_in = self.last_layer1_conv(out0_branch)
x1_in = self.last_layer1_upsample(x1_in)
# 26,26,256 + 26,26,512 -> 26,26,768
x1_in = torch.cat([x1_in, x1], 1)
# 第二个特征层
# out1 = (batch_size,255,26,26)
# 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
out1, out1_branch = _branch(self.last_layer1, x1_in)
# yolo branch 2
# 26,26,256 -> 26,26,128 -> 52,52,128
x2_in = self.last_layer2_conv(out1_branch)
x2_in = self.last_layer2_upsample(x2_in)
# 52,52,128 + 52,52,256 -> 52,52,384
x2_in = torch.cat([x2_in, x2], 1)
# Third feature layer
# out2 = (batch_size,255,52,52)
# 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
out2, _ = _branch(self.last_layer2, x2_in)
return out0, out1, out2
# 对单张图片进行预测
from yolo import YOLO
from PIL import Image
from yolo import YOLO
yolo = YOLO()
while True:
......@@ -5,6 +5,7 @@
import torch
from torchsummary import summary
from nets.yolo3 import YoloBody
from utils.config import Config
......@@ -2,21 +2,24 @@
# 对数据集进行训练
import os
import numpy as np
import time
import numpy as np
import torch
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from utils.config import Config
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from utils.dataloader import yolo_dataset_collate, YoloDataset
from nets.yolo_training import YOLOLoss,Generator
from nets.yolo3 import YoloBody
from tqdm import tqdm
from nets.yolo3 import YoloBody
from nets.yolo_training import Generator, YOLOLoss
from utils.config import Config
from utils.dataloader import YoloDataset, yolo_dataset_collate
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Reads the 'lr' entry of the first element of ``optimizer.param_groups``.
    Returns ``None`` when there are no parameter groups.
    """
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        # No parameter groups: nothing to report.
        return None
    return first_group['lr']
......@@ -24,7 +27,8 @@ def get_lr(optimizer):
def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epoch,cuda):
total_loss = 0
val_loss = 0
start_time = time.time()
with tqdm(total=epoch_size,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar:
for iteration, batch in enumerate(gen):
if iteration >= epoch_size:
......@@ -37,25 +41,38 @@ def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epo
images = Variable(torch.from_numpy(images).type(torch.FloatTensor))
targets = [Variable(torch.from_numpy(ann).type(torch.FloatTensor)) for ann in targets]
# 清零梯度
# 前向传播
outputs = net(images)
losses = []
num_pos_all = 0
# 计算损失
for i in range(3):
loss_item = yolo_losses[i](outputs[i], targets)
loss = sum(losses)
loss_item, num_pos = yolo_losses[i](outputs[i], targets)
num_pos_all += num_pos
loss = sum(losses) / num_pos
# 反向传播
total_loss += loss
waste_time = time.time() - start_time
total_loss += loss.item()
pbar.set_postfix(**{'total_loss': total_loss.item() / (iteration + 1),
'lr' : get_lr(optimizer),
'step/s' : waste_time})
pbar.set_postfix(**{'total_loss': total_loss / (iteration + 1),
'lr' : get_lr(optimizer)})
start_time = time.time()
print('Start Validation')
with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar:
......@@ -74,14 +91,15 @@ def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epo
outputs = net(images_val)
losses = []
num_pos_all = 0
for i in range(3):
loss_item = yolo_losses[i](outputs[i], targets_val)
loss = sum(losses)
val_loss += loss
pbar.set_postfix(**{'total_loss': val_loss.item() / (iteration + 1)})
loss_item, num_pos = yolo_losses[i](outputs[i], targets_val)
num_pos_all += num_pos
loss = sum(losses) / num_pos
val_loss += loss.item()
pbar.set_postfix(**{'total_loss': val_loss / (iteration + 1)})
print('Finish Validation')
print('Epoch:'+ str(epoch+1) + '/' + str(Epoch))
print('Total Loss: %.4f || Val Loss: %.4f ' % (total_loss/(epoch_size+1),val_loss/(epoch_size_val+1)))
......@@ -94,22 +112,33 @@ def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epo
# https://www.bilibili.com/video/BV1zE411u7Vw
if __name__ == "__main__":
# 参数初始化
annotation_path = '2007_train.txt'
model = YoloBody(Config)
# 是否使用Cuda
# 没有GPU可以设置成False
Cuda = True
# Whether to use the PyTorch DataLoader
Use_Data_Loader = True
# 是否对损失进行归一化
normalize = True
# 创建yolo模型
# 训练前一定要修改Config里面的classes参数
model = YoloBody(Config)
# 权值文件的下载请看README
# 权值文件请看README,百度网盘下载
model_path = "model_data/yolo_weights.pth"
print('Loading weights into state dict...')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_dict = model.state_dict()
pretrained_dict = torch.load("model_data/yolo_weights.pth", map_location=device)
pretrained_dict = torch.load(model_path, map_location=device)
pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)}
......@@ -126,9 +155,17 @@ if __name__ == "__main__":
yolo_losses = []
for i in range(3):
Config["yolo"]["classes"], (Config["img_w"], Config["img_h"]), Cuda))
Config["yolo"]["classes"], (Config["img_w"], Config["img_h"]), Cuda, normalize))
# 0.1用于验证,0.9用于训练
# 获得图片路径和标签
annotation_path = '2007_train.txt'
# 验证集的划分在train.py代码里面进行
# 2007_test.txt和2007_val.txt里面没有内容是正常的。训练不会使用到。
# 当前划分方式下,验证集和训练集的比例为1:9
val_split = 0.1
with open(annotation_path) as f:
lines = f.readlines()
......@@ -138,17 +175,15 @@ if __name__ == "__main__":
num_val = int(len(lines)*val_split)
num_train = len(lines) - num_val
# 主干特征提取网络特征通用,冻结训练可以加快训练速度
# 也可以在训练初期防止权值被破坏。
# Init_Epoch为起始世代
# Freeze_Epoch为冻结训练的世代
# Epoch总训练世代
# Unfreeze_Epoch总训练世代
# 提示OOM或者显存不足请调小Batch_size
if True:
# 最开始使用1e-3的学习率可以收敛的更快
lr = 1e-3
Batch_size = 8
Init_Epoch = 0
......@@ -158,17 +193,17 @@ if __name__ == "__main__":
lr_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=1,gamma=0.95)
if Use_Data_Loader:
train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]))
val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]))
train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]), True)
val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]), False)
gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate)
gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4,pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate)
gen = Generator(Batch_size, lines[:num_train],
(Config["img_h"], Config["img_w"])).generate()
(Config["img_h"], Config["img_w"])).generate(True)
gen_val = Generator(Batch_size, lines[num_train:],
(Config["img_h"], Config["img_w"])).generate()
(Config["img_h"], Config["img_w"])).generate(False)
epoch_size = num_train//Batch_size
epoch_size_val = num_val//Batch_size
......@@ -190,18 +225,19 @@ if __name__ == "__main__":
optimizer = optim.Adam(net.parameters(),lr)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=1,gamma=0.95)
if Use_Data_Loader:
train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]))
val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]))
train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]), True)
val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]), False)
gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate)
gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4,pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate)
gen = Generator(Batch_size, lines[:num_train],
(Config["img_h"], Config["img_w"])).generate()
(Config["img_h"], Config["img_w"])).generate(True)
gen_val = Generator(Batch_size, lines[num_train:],
(Config["img_h"], Config["img_w"])).generate()
(Config["img_h"], Config["img_w"])).generate(False)
epoch_size = num_train//Batch_size
epoch_size_val = num_val//Batch_size
Config = \
# 训练前一定要修改classes参数
# anchors可以不修改,因为anchors的通用性较大
# 而且大中小的设置非常符合yolo的特征层情况
"yolo": {
"anchors": [[[116, 90], [156, 198], [373, 326]],
[[30, 61], [62, 45], [59, 119]],
[[10, 13], [16, 30], [33, 23]]],
"classes": 20,
# img_h和img_w可以修改成608x608
"img_h": 416,
"img_w": 416,
......@@ -13,12 +13,13 @@ from nets.yolo_training import Generator
import cv2
class YoloDataset(Dataset):
def __init__(self, train_lines, image_size):
def __init__(self, train_lines, image_size, is_train):
super(YoloDataset, self).__init__()
self.train_lines = train_lines
self.train_batches = len(train_lines)
self.image_size = image_size
self.is_train = is_train
def __len__(self):
return self.train_batches
......@@ -26,7 +27,7 @@ class YoloDataset(Dataset):
def rand(self, a=0, b=1):
    """Return ``a`` plus a random fraction (from ``np.random.rand()``) of ``b - a``."""
    span = b - a
    fraction = np.random.rand()
    return fraction * span + a
def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5):
def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):
line = annotation_line.split()
image = Image.open(line[0])
......@@ -34,6 +35,35 @@ class YoloDataset(Dataset):
h, w = input_shape
box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
if not random:
scale = min(w/iw, h/ih)
nw = int(iw*scale)
nh = int(ih*scale)
dx = (w-nw)//2
dy = (h-nh)//2
image = image.resize((nw,nh), Image.BICUBIC)
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image, np.float32)
# 调整目标框坐标
box_data = np.zeros((len(box), 5))
if len(box) > 0:
box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
box[:, 0:2][box[:, 0:2] < 0] = 0
box[:, 2][box[:, 2] > w] = w
box[:, 3][box[:, 3] > h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w > 1, box_h > 1)] # 保留有效框
box_data = np.zeros((len(box), 5))
box_data[:len(box)] = box
return image_data, box_data
# 调整图片大小
new_ar = w / h * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter)
scale = self.rand(.25, 2)
......@@ -48,8 +78,7 @@ class YoloDataset(Dataset):
# 放置图片
dx = int(self.rand(0, w - nw))
dy = int(self.rand(0, h - nh))
new_image = Image.new('RGB', (w, h),
(np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255)))
new_image = Image.new('RGB', (w, h), (128, 128, 128))
new_image.paste(image, (dx, dy))
image = new_image
......@@ -89,19 +118,18 @@ class YoloDataset(Dataset):
box = box[np.logical_and(box_w > 1, box_h > 1)] # 保留有效框
box_data = np.zeros((len(box), 5))
box_data[:len(box)] = box
if len(box) == 0:
return image_data, []
if (box_data[:, :4] > 0).any():
return image_data, box_data
return image_data, []
return image_data, box_data
def __getitem__(self, index):
lines = self.train_lines
n = self.train_batches
index = index % n
img, y = self.get_random_data(lines[index], self.image_size[0:2])
if self.is_train:
img, y = self.get_random_data(lines[index], self.image_size[0:2])
img, y = self.get_random_data(lines[index], self.image_size[0:2], False)
if len(y) != 0:
# 从坐标转换成0~1的百分比
boxes = np.array(y[:, :4], dtype=np.float32)
from __future__ import division
import os
import math
import os
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from torch.autograd import Variable
from torchvision.ops import nms
from PIL import Image, ImageDraw, ImageFont
class DecodeBox(nn.Module):
def __init__(self, anchors, num_classes, img_size):
super(DecodeBox, self).__init__()
# 13x13的特征层对应的anchor是[116,90],[156,198],[373,326]
# 26x26的特征层对应的anchor是[30,61],[62,45],[59,119]
# 52x52的特征层对应的anchor是[10,13],[16,30],[33,23]
self.anchors = anchors
self.num_anchors = len(anchors)
self.num_classes = num_classes
......@@ -20,17 +28,33 @@ class DecodeBox(nn.Module):
self.img_size = img_size
def forward(self, input):
# 输入的input一共有三个,他们的shape分别是
# batch_size, 255, 13, 13
# batch_size, 255, 26, 26
# batch_size, 255, 52, 52
batch_size = input.size(0)
input_height = input.size(2)
input_width = input.size(3)
# 计算步长
# 输入为416x416时
# stride_h = stride_w = 32、16、8
stride_h = self.img_size[1] / input_height
stride_w = self.img_size[0] / input_width
# 归一到特征层上
# 此时获得的scaled_anchors大小是相对于特征层的
scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors]
# 对预测结果进行resize
# 输入的input一共有三个,他们的shape分别是
# batch_size, 3, 13, 13, 85
# batch_size, 3, 26, 26, 85
# batch_size, 3, 52, 52, 85
prediction = input.view(batch_size, self.num_anchors,
self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
......@@ -38,37 +62,48 @@ class DecodeBox(nn.Module):
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
# 先验框的宽高调整参数
w = prediction[..., 2] # Width
h = prediction[..., 3] # Height
w = prediction[..., 2]
h = prediction[..., 3]
# 获得置信度,是否有物体
conf = torch.sigmoid(prediction[..., 4])
# 种类置信度
pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred.
pred_cls = torch.sigmoid(prediction[..., 5:])
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
# 生成网格,先验框中心,网格左上角 batch_size,3,13,13
# 生成网格,先验框中心,网格左上角
# batch_size,3,13,13
grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
batch_size * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
batch_size * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
# 生成先验框的宽高
# 按照网格格式生成先验框的宽高
# batch_size,3,13,13
anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
# 计算调整后的先验框中心与宽高
# 利用预测结果对先验框进行调整
# 首先调整先验框的中心,从先验框中心向右下角偏移
# 再调整先验框的宽高。
pred_boxes = FloatTensor(prediction[..., :4].shape)
pred_boxes[..., 0] = x.data + grid_x
pred_boxes[..., 1] = y.data + grid_y
pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
# 用于将输出调整为相对于416x416的大小
# 将输出结果调整成相对于输入图像大小
_scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor)
output = torch.cat((pred_boxes.view(batch_size, -1, 4) * _scale,
conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
......@@ -139,7 +174,10 @@ def bbox_iou(box1, box2, x1y1x2y2=True):
def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
# 求左上角和右下角
# 将预测结果的格式转换成左上角右下角的格式。
# prediction [batch_size, num_anchors, 85]
box_corner = prediction.new(prediction.shape)
box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
......@@ -149,21 +187,35 @@ def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
output = [None for _ in range(len(prediction))]
for image_i, image_pred in enumerate(prediction):
# 获得种类及其置信度
# Take the max over the class-prediction part.
# class_conf [num_anchors, 1] class confidence
# class_pred [num_anchors, 1] predicted class index
class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
# 利用置信度进行第一轮筛选
conf_mask = (image_pred[:, 4]*class_conf[:, 0] >= conf_thres).squeeze()
# 利用置信度进行第一轮筛选
conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze()
# 根据置信度进行预测结果的筛选
image_pred = image_pred[conf_mask]
class_conf = class_conf[conf_mask]
class_pred = class_pred[conf_mask]
if not image_pred.size(0):
# 获得的内容为(x1, y1, x2, y2, obj_conf, class_conf, class_pred)
# detections [num_kept, 7] (per image, after the confidence filter)
# the 7 entries are: x1, y1, x2, y2, obj_conf, class_conf, class_pred
detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
# 获得种类
# 获得预测结果中包含的所有种类
unique_labels = detections[:, -1].cpu().unique()
if prediction.is_cuda:
......@@ -171,7 +223,9 @@ def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
detections = detections.cuda()
for c in unique_labels:
# 获得某一类初步筛选后全部的预测结果
# 获得某一类得分筛选后全部的预测结果
detections_class = detections[detections[:, -1] == c]
......@@ -179,7 +233,7 @@ def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
keep = nms(
detections_class[:, :4],
detections_class[:, 4]*detections_class[:, 5],
detections_class[:, 4] * detections_class[:, 5],
max_detections = detections_class[keep]
# 调用摄像头检测
# 调用摄像头或者视频进行检测
# 调用摄像头直接运行即可
# 调用视频可以将cv2.VideoCapture()指定路径
# 视频的保存并不难,可以百度一下看看
from yolo import YOLO
from PIL import Image
import numpy as np
import cv2
import time
yolo = YOLO()
# 调用摄像头
capture=cv2.VideoCapture(0) # capture=cv2.VideoCapture("1.mp4")
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
yolo = YOLO()
# 调用摄像头
# capture=cv2.VideoCapture("1.mp4")
fps = 0.0
t1 = time.time()
......@@ -19,10 +27,8 @@ while(True):
frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
# 转变成Image
frame = Image.fromarray(np.uint8(frame))
# 进行检测
frame = np.array(yolo.detect_image(frame))
# RGBtoBGR满足opencv显示格式
frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR)
......@@ -32,7 +38,6 @@ while(True):
c= cv2.waitKey(1) & 0xff
if c==27:
# 运行前一定要修改classes
# 如果生成的2007_train.txt里面没有目标信息
# 那么就是因为classes没有设定正确
import xml.etree.ElementTree as ET
from os import getcwd
# 创建YOLO类
import cv2
import numpy as np
import colorsys
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
from nets.yolo3 import YoloBody
import torch.backends.cudnn as cudnn
from PIL import Image,ImageFont, ImageDraw
import torch.nn as nn
from PIL import Image, ImageDraw, ImageFont
from torch.autograd import Variable
from nets.yolo3 import YoloBody
from utils.config import Config
from utils.utils import non_max_suppression, bbox_iou, DecodeBox,letterbox_image,yolo_correct_boxes
from utils.utils import (DecodeBox, bbox_iou, letterbox_image,
non_max_suppression, yolo_correct_boxes)
# 使用自己训练好的模型预测需要修改2个参数
# model_path和classes_path都需要修改!
# 如果出现shape不匹配,一定要注意
# 训练时的model_path和classes_path参数的修改
class YOLO(object):
_defaults = {
......@@ -52,14 +58,20 @@ class YOLO(object):
class_names = f.readlines()
class_names = [c.strip() for c in class_names]
return class_names
# 获得所有的分类
# 生成模型
def generate(self):
self.config["yolo"]["classes"] = len(self.class_names)
# 建立yolov3模型
self.net = YoloBody(self.config)
# 加快模型训练的效率
# 载入yolov3模型的权重
print('Loading weights into state dict...')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
state_dict = torch.load(self.model_path, map_location=device)
......@@ -71,10 +83,12 @@ class YOLO(object):
self.net = nn.DataParallel(self.net)
self.net = self.net.cuda()
# 建立三个特征层解码用的工具
self.yolo_decodes = []
for i in range(3):
self.yolo_decodes.append(DecodeBox(self.config["yolo"]["anchors"][i], self.config["yolo"]["classes"], (self.model_image_size[1], self.model_image_size[0])))
self.yolo_decodes.append(DecodeBox(self.config["yolo"]["anchors"][i], self.config["yolo"]["classes"], (self.model_image_size[1], self.model_image_size[0])))
print('{} model, anchors, and classes loaded.'.format(self.model_path))
# 画框设置不同的颜色
......@@ -91,44 +105,65 @@ class YOLO(object):
def detect_image(self, image):
image_shape = np.array(np.shape(image)[0:2])
# 给图像增加灰条,实现不失真的resize
crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
photo = np.array(crop_img,dtype = np.float32)
photo /= 255.0
photo = np.array(crop_img,dtype = np.float32) / 255.0
photo = np.transpose(photo, (2, 0, 1))
photo = photo.astype(np.float32)
images = []
# 添加上batch_size维度
images = [photo]
images = np.asarray(images)
images = torch.from_numpy(images)
if self.cuda:
images = images.cuda()
with torch.no_grad():
images = torch.from_numpy(np.asarray(images))
if self.cuda:
images = images.cuda()
# 将图像输入网络当中进行预测!
outputs = self.net(images)
output_list = []
for i in range(3):
# 将预测框进行堆叠,然后进行非极大抑制
output = torch.cat(output_list, 1)
batch_detections = non_max_suppression(output, self.config["yolo"]["classes"],
try :
batch_detections = batch_detections[0].cpu().numpy()
return image
top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence
top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
top_label = np.array(batch_detections[top_index,-1],np.int32)
top_bboxes = np.array(batch_detections[top_index,:4])
top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
# 去掉灰条
boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
# 如果没有检测出物体,返回原图
try :
batch_detections = batch_detections[0].cpu().numpy()
return image
# 对预测框进行得分筛选
top_index = batch_detections[:,4] * batch_detections[:,5] > self.confidence
top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
top_label = np.array(batch_detections[top_index,-1],np.int32)
top_bboxes = np.array(batch_detections[top_index,:4])
top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
# 在图像传入网络预测前会进行letterbox_image给图像周围添加灰条
# 因此生成的top_bboxes是相对于有灰条的图像的
# 我们需要对其进行修改,去除灰条的部分。
boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
font = ImageFont.truetype(font='model_data/simhei.ttf',size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32'))
thickness = (np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0]
thickness = max((np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0], 1)
for i, c in enumerate(top_label):
predicted_class = self.class_names[c]
......@@ -150,7 +185,7 @@ class YOLO(object):
draw = ImageDraw.Draw(image)
label_size = draw.textsize(label, font)
label = label.encode('utf-8')
print(label, top, left, bottom, right)
if top - label_size[1] >= 0:
text_origin = np.array([left, top - label_size[1]])
