Commit db7cfc1c authored by Yang Zhang

Add `yolov3` demo

Parent 3e8128cf
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import argparse
import contextlib
import os
import random
import time
import cv2
import numpy as np
from pycocotools.coco import COCO
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from model import Model, Loss, shape_hints
from resnet import ResNet, ConvBNLayer


# XXX transfer learning
class ResNetBackBone(ResNet):
    def __init__(self, depth=50):
        super(ResNetBackBone, self).__init__(depth=depth)
        delattr(self, 'fc')

    def forward(self, inputs):
        x = self.conv(inputs)
        x = self.pool(x)
        outputs = []
        for layer in self.layers:
            x = layer(x)
            outputs.append(x)
        return outputs


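# The detection block below implements the YOLOv3 head unit: alternating
# 1x1 and 3x3 conv-bn-leaky_relu layers. Its `route` output is passed on to
# the next (finer) scale and its `tip` output feeds the per-scale predictor.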
class YoloDetectionBlock(fluid.dygraph.Layer):
    def __init__(self, num_channels, num_filters):
        super(YoloDetectionBlock, self).__init__()

        assert num_filters % 2 == 0, \
            "num_filters {} cannot be divided by 2".format(num_filters)

        self.conv0 = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=1,
            act='leaky_relu')
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 2,
            filter_size=3,
            act='leaky_relu')
        self.conv2 = ConvBNLayer(
            num_channels=num_filters * 2,
            num_filters=num_filters,
            filter_size=1,
            act='leaky_relu')
        self.conv3 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 2,
            filter_size=3,
            act='leaky_relu')
        self.route = ConvBNLayer(
            num_channels=num_filters * 2,
            num_filters=num_filters,
            filter_size=1,
            act='leaky_relu')
        self.tip = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 2,
            filter_size=3,
            act='leaky_relu')

    def forward(self, inputs):
        out = self.conv0(inputs)
        out = self.conv1(out)
        out = self.conv2(out)
        out = self.conv3(out)
        route = self.route(out)
        tip = self.tip(route)
        return route, tip


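# The full model: the three coarsest backbone feature maps (strides 32/16/8)
# are fused top-down, FPN style, and decoded with fluid.layers.yolo_box at
# test time. The anchors are the standard COCO YOLOv3 anchors, grouped by
# `anchor_masks` into three per scale.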
class YOLOv3(Model):
    def __init__(self):
        super(YOLOv3, self).__init__()
        self.num_classes = 80
        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
                        59, 119, 116, 90, 156, 198, 373, 326]
        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        self.valid_thresh = 0.005
        # NMS IoU threshold; 0.45 is the usual YOLOv3 setting
        self.nms_thresh = 0.45
        self.nms_topk = 400
        self.nms_posk = 100
        self.draw_thresh = 0.5

        self.backbone = ResNetBackBone()
        self.block_outputs = []
        self.yolo_blocks = []
        self.route_blocks = []

        for idx, num_chan in enumerate([2048, 1280, 640]):
            yolo_block = self.add_sublayer(
                "detection_block_{}".format(idx),
                YoloDetectionBlock(num_chan, num_filters=512 // (2**idx)))
            self.yolo_blocks.append(yolo_block)

            num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5)
            block_out = self.add_sublayer(
                "block_out_{}".format(idx),
                Conv2D(num_channels=1024 // (2**idx),
                       num_filters=num_filters,
                       filter_size=1,
                       param_attr=ParamAttr(
                           initializer=fluid.initializer.Normal(0., 0.02)),
                       bias_attr=ParamAttr(
                           initializer=fluid.initializer.Constant(0.0),
                           regularizer=L2Decay(0.))))
            self.block_outputs.append(block_out)

            if idx < 2:
                route = self.add_sublayer(
                    "route_{}".format(idx),
                    ConvBNLayer(num_channels=512 // (2**idx),
                                num_filters=256 // (2**idx),
                                filter_size=1,
                                act='leaky_relu'))
                self.route_blocks.append(route)

    @shape_hints(inputs=[None, 3, None, None])
    def forward(self, inputs, im_shape):
        outputs = []
        boxes = []
        scores = []
        downsample = 32

        feats = self.backbone(inputs)
        feats = feats[::-1][:len(self.anchor_masks)]
        route = None
        for idx, feat in enumerate(feats):
            if idx > 0:
                feat = fluid.layers.concat(input=[route, feat], axis=1)
            route, tip = self.yolo_blocks[idx](feat)
            block_out = self.block_outputs[idx](tip)

            if idx < 2:
                route = self.route_blocks[idx](route)
                route = fluid.layers.resize_nearest(route, scale=2)

            anchor_mask = self.anchor_masks[idx]
            mask_anchors = []
            for m in anchor_mask:
                mask_anchors.append(self.anchors[2 * m])
                mask_anchors.append(self.anchors[2 * m + 1])

            b, s = fluid.layers.yolo_box(
                x=block_out,
                img_size=im_shape,
                anchors=mask_anchors,
                class_num=self.num_classes,
                conf_thresh=self.valid_thresh,
                downsample_ratio=downsample)

            outputs.append(block_out)
            boxes.append(b)
            scores.append(fluid.layers.transpose(s, perm=[0, 2, 1]))

            downsample //= 2

        if self.mode != 'test':
            return outputs

        return fluid.layers.multiclass_nms(
            bboxes=fluid.layers.concat(boxes, axis=1),
            scores=fluid.layers.concat(scores, axis=2),
            score_threshold=self.valid_thresh,
            nms_top_k=self.nms_topk,
            keep_top_k=self.nms_posk,
            nms_threshold=self.nms_thresh,
            background_label=-1)


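# Training loss: one fluid.layers.yolov3_loss term per scale, matching the
# head outputs above (downsample ratios 32, 16, 8). Ground truth is padded
# to `num_max_boxes` boxes per image so input shapes stay fixed.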
class YoloLoss(Loss):
    def __init__(self, num_classes=80, num_max_boxes=50):
        super(YoloLoss, self).__init__()
        self.num_classes = num_classes
        self.num_max_boxes = num_max_boxes
        self.ignore_thresh = 0.7
        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
                        59, 119, 116, 90, 156, 198, 373, 326]
        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    def forward(self, outputs, labels):
        downsample = 32
        gt_box, gt_label, gt_score = labels
        losses = []

        for idx, out in enumerate(outputs):
            anchor_mask = self.anchor_masks[idx]
            loss = fluid.layers.yolov3_loss(
                x=out,
                gt_box=gt_box,
                gt_label=gt_label,
                gt_score=gt_score,
                anchor_mask=anchor_mask,
                downsample_ratio=downsample,
                anchors=self.anchors,
                class_num=self.num_classes,
                ignore_thresh=self.ignore_thresh,
                use_label_smooth=True)
            losses.append(loss)
            downsample //= 2
        return losses

    def infer_shape(self, _):
        return [
            [None, self.num_max_boxes, 4],
            [None, self.num_max_boxes],
            [None, self.num_max_boxes]
        ]

    def infer_dtype(self, _):
        return ['float32', 'int32', 'float32']


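# Standard YOLOv3 schedule: SGD with momentum, piecewise decay of the base
# learning rate by 10x at 400k and 450k iterations (values = [1e-3, 1e-4, 1e-5]),
# plus a linear warmup over the first 4k iterations.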
def make_optimizer(parameter_list=None):
    base_lr = 0.001
    boundaries = [400000, 450000]
    warm_up_iter = 4000
    momentum = 0.9
    weight_decay = 5e-4

    values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
    lr = fluid.layers.piecewise_decay(
        boundaries=boundaries,
        values=values)
    lr = fluid.layers.linear_lr_warmup(
        learning_rate=lr,
        warmup_steps=warm_up_iter,
        start_lr=0.0,
        end_lr=base_lr)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(weight_decay),
        momentum=momentum,
        parameter_list=parameter_list)
    return optimizer


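# Pairwise IoU between box set `a` (N, 4) and box set `b` (M, 4) in
# [x1, y1, x2, y2] form, computed with broadcasting; returns an (N, M) matrix.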
def _iou_matrix(a, b):
    tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
    br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
    area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    area_o = (area_a[:, np.newaxis] + area_b - area_i)
    return area_i / (area_o + 1e-10)


def _crop_box_with_center_constraint(box, crop):
    cropped_box = box.copy()
    cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
    cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
    cropped_box[:, :2] -= crop[:2]
    cropped_box[:, 2:] -= crop[:2]
    centers = (box[:, :2] + box[:, 2:]) / 2
    valid = np.logical_and(
        crop[:2] <= centers, centers < crop[2:]).all(axis=1)
    valid = np.logical_and(
        valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
    return cropped_box, np.where(valid)[0]


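# SSD-style random crop: sample crop windows until one overlaps the ground
# truth by at least a randomly chosen IoU threshold, then keep only the boxes
# whose centers fall inside the crop, clipped to the crop window.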
def random_crop(inputs):
    aspect_ratios = [.5, 2.]
    thresholds = [.0, .1, .3, .5, .7, .9]
    scaling = [.3, 1.]

    img, gt_box, gt_label = inputs
    h, w = img.shape[:2]

    if len(gt_box) == 0:
        return inputs

    np.random.shuffle(thresholds)
    for thresh in thresholds:
        found = False
        for i in range(50):
            scale = np.random.uniform(*scaling)
            min_ar, max_ar = aspect_ratios
            ar = np.random.uniform(max(min_ar, scale**2),
                                   min(max_ar, scale**-2))
            crop_h = int(h * scale / np.sqrt(ar))
            crop_w = int(w * scale * np.sqrt(ar))
            crop_y = np.random.randint(0, h - crop_h)
            crop_x = np.random.randint(0, w - crop_w)
            crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
            iou = _iou_matrix(gt_box, np.array([crop_box], dtype=np.float32))
            if iou.max() < thresh:
                continue

            cropped_box, valid_ids = _crop_box_with_center_constraint(
                gt_box, np.array(crop_box, dtype=np.float32))
            if valid_ids.size > 0:
                found = True
                break

        if found:
            x1, y1, x2, y2 = crop_box
            img = img[y1:y2, x1:x2, :]
            gt_box = np.take(cropped_box, valid_ids, axis=0)
            gt_label = np.take(gt_label, valid_ids, axis=0)
            return img, gt_box, gt_label

    return inputs


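# Per-sample transform: random crop and horizontal flip in training, then
# boxes are converted to [cx, cy, w, h] normalized by the image size and
# padded to a fixed `num_max_boxes` so samples can be batched.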
# XXX mix up, color distort and random expand are skipped for simplicity
def sample_transform(inputs, mode='train', num_max_boxes=50):
    if mode == 'train':
        img, gt_box, gt_label = random_crop(inputs)
    else:
        img, gt_box, gt_label = inputs

    h, w = img.shape[:2]
    # random flip
    if mode == 'train' and np.random.uniform(0., 1.) > .5:
        img = img[:, ::-1, :]
        if len(gt_box) > 0:
            swap = gt_box.copy()
            gt_box[:, 0] = w - swap[:, 2] - 1
            gt_box[:, 2] = w - swap[:, 0] - 1

    if len(gt_label) == 0:
        gt_box = np.zeros([num_max_boxes, 4], dtype=np.float32)
        gt_label = np.zeros([num_max_boxes, 1], dtype=np.int32)
        return img, gt_box, gt_label

    gt_box = gt_box[:num_max_boxes, :]
    gt_label = gt_label[:num_max_boxes, 0]
    # normalize boxes
    gt_box /= np.array([w, h] * 2, dtype=np.float32)
    gt_box[:, 2:] = gt_box[:, 2:] - gt_box[:, :2]
    gt_box[:, :2] = gt_box[:, :2] + gt_box[:, 2:] / 2.

    pad = num_max_boxes - gt_label.size
    gt_box = np.pad(gt_box, ((0, pad), (0, 0)), mode='constant')
    gt_label = np.pad(gt_label, [(0, pad)], mode='constant')
    return img, gt_box, gt_label


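# Batch-level transform: multi-scale resize (a random size in 320-608 and a
# random interpolation per batch during training, fixed 608 otherwise),
# BGR->RGB conversion, mean/std normalization and NHWC->NCHW layout.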
def batch_transform(batch, mode='train'):
    if mode == 'train':
        d = np.random.choice(
            [320, 352, 384, 416, 448, 480, 512, 544, 576, 608])
        interp = np.random.choice(range(5))
    else:
        d = 608
        interp = cv2.INTER_CUBIC

    # transpose batch
    imgs, gt_boxes, gt_labels = list(zip(*batch))
    imgs = np.array([cv2.resize(
        img, (d, d), interpolation=interp) for img in imgs])

    # transpose, permute and normalize
    imgs = imgs.astype(np.float32)[..., ::-1]
    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
    std = np.array([58.395, 57.120, 57.375], dtype=np.float32)
    invstd = 1. / std
    imgs -= mean
    imgs *= invstd
    imgs = imgs.transpose((0, 3, 1, 2))
    im_shapes = np.full([len(imgs), 2], d, dtype=np.int32)

    gt_boxes = np.array(gt_boxes)
    gt_labels = np.array(gt_labels)
    # XXX since mix up is not used, scores are all 1s
    gt_scores = np.ones_like(gt_labels, dtype=np.float32)
    return [imgs, im_shapes], [gt_boxes, gt_labels, gt_scores]


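# COCO 2017 reader built on pycocotools: loads the instance annotations,
# clips boxes to the image, remaps category ids to a contiguous 1..80 range
# and returns an iterator over (image, gt_box, gt_label) samples.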
def coco2017(root_dir, mode='train'):
    json_path = os.path.join(
        root_dir, 'annotations/instances_{}2017.json'.format(mode))
    coco = COCO(json_path)
    img_ids = coco.getImgIds()
    imgs = coco.loadImgs(img_ids)
    class_map = {v: i + 1 for i, v in enumerate(coco.getCatIds())}
    samples = []

    for img in imgs:
        img_path = os.path.join(
            root_dir, '{}2017'.format(mode), img['file_name'])
        file_path = img_path
        width = img['width']
        height = img['height']
        ann_ids = coco.getAnnIds(imgIds=img['id'], iscrowd=False)
        anns = coco.loadAnns(ann_ids)

        gt_box = []
        gt_label = []
        for ann in anns:
            x1, y1, w, h = ann['bbox']
            x2 = x1 + w - 1
            y2 = y1 + h - 1
            x1 = np.clip(x1, 0, width - 1)
            x2 = np.clip(x2, 0, width - 1)
            y1 = np.clip(y1, 0, height - 1)
            y2 = np.clip(y2, 0, height - 1)
            if ann['area'] <= 0 or x2 < x1 or y2 < y1:
                continue
            gt_label.append(ann['category_id'])
            gt_box.append([x1, y1, x2, y2])

        gt_box = np.array(gt_box, dtype=np.float32)
        gt_label = np.array([class_map[cls] for cls in gt_label],
                            dtype=np.int32)[:, np.newaxis]

        if gt_label.size == 0 and not mode == 'train':
            continue
        samples.append((file_path, gt_box.copy(), gt_label.copy()))

    def iterator():
        if mode == 'train':
            random.shuffle(samples)
        for file_path, gt_box, gt_label in samples:
            img = cv2.imread(file_path)
            yield img, gt_box, gt_label

    return iterator


# XXX coco metrics not included for simplicity
def run(model, loader, mode='train'):
    total_loss = 0.0
    total_time = 0.
    device_ids = list(range(FLAGS.num_devices))
    start = time.time()

    for idx, batch in enumerate(loader()):
        outputs, losses = getattr(model, mode)(
            batch[0], batch[1], device='gpu', device_ids=device_ids)

        total_loss += np.sum(losses)
        if idx > 1:  # skip the first two steps
            total_time += time.time() - start
        if idx % 10 == 0:
            print("{:04d}: loss {:0.3f} time: {:0.3f}".format(
                idx, total_loss / (idx + 1), total_time / (idx - 1)))
        start = time.time()


def main():
    @contextlib.contextmanager
    def null_guard():
        yield

    epoch = FLAGS.epoch
    batch_size = FLAGS.batch_size
    if FLAGS.dynamic:
        guard = fluid.dygraph.guard()
    else:
        guard = null_guard()

    train_loader = fluid.io.xmap_readers(
        lambda batch: batch_transform(batch, 'train'),
        paddle.batch(
            fluid.io.xmap_readers(
                lambda inputs: sample_transform(inputs, 'train'),
                coco2017(FLAGS.data, 'train'),
                process_num=8,
                buffer_size=4 * batch_size),
            batch_size=batch_size,
            drop_last=True),
        process_num=2, buffer_size=4)

    val_loader = fluid.io.xmap_readers(
        lambda batch: batch_transform(batch, 'val'),
        paddle.batch(
            fluid.io.xmap_readers(
                lambda inputs: sample_transform(inputs, 'val'),
                coco2017(FLAGS.data, 'val'),
                process_num=8,
                buffer_size=4 * batch_size),
            batch_size=batch_size),
        process_num=2, buffer_size=4)

    if not os.path.exists('yolo_checkpoints'):
        os.mkdir('yolo_checkpoints')

    with guard:
        model = YOLOv3()
        # XXX transfer learning
        if FLAGS.weights is not None:
            model.backbone.load(FLAGS.weights)
        optim = make_optimizer(parameter_list=model.parameters())
        model.prepare(optim, YoloLoss())

        for e in range(epoch):
            print("======== train epoch {} ========".format(e))
            run(model, train_loader)
            model.save('yolo_checkpoints/{:02d}'.format(e))
            print("======== eval epoch {} ========".format(e))
            run(model, val_loader, mode='eval')


if __name__ == '__main__':
    parser = argparse.ArgumentParser("Yolov3 Training on COCO")
    parser.add_argument('data', metavar='DIR', help='path to COCO dataset')
    parser.add_argument(
        "-e", "--epoch", default=300, type=int, help="number of epochs")
    parser.add_argument(
        "-b", "--batch_size", default=32, type=int, help="batch size")
    parser.add_argument(
        "-n", "--num_devices", default=8, type=int, help="number of devices")
    parser.add_argument(
        "-d", "--dynamic", action='store_true', help="enable dygraph mode")
    parser.add_argument(
        "-w", "--weights", default=None, type=str,
        help="path to pretrained weights")
    FLAGS = parser.parse_args()
    main()
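

# A minimal launch sketch (hypothetical paths; assumes this file is saved as
# yolov3.py and COCO 2017 is unpacked under /data/coco with annotations/,
# train2017/ and val2017/):
#
#   python yolov3.py /data/coco -b 32 -n 8 -d -w /path/to/resnet50_weights
#
# `-d` enables dygraph mode; `-w` points at backbone weights loadable by
# `model.backbone.load` for transfer learning.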