# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division
from __future__ import print_function

import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay

from paddle.static import InputSpec
from paddle.utils.download import get_weights_path_from_url

from darknet import darknet53

__all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53']

# {num_layers: (url, md5)}
pretrain_infos = {
    53: ('https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams',
         'aed7dd45124ff2e844ae3bd5ba6c91d2')
}


class ConvBNLayer(fluid.dygraph.Layer):
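    """Conv2D followed by BatchNorm, with an optional leaky ReLU
    (alpha=0.1) applied when act is 'leaky'; any other act value
    leaves the output unactivated.
    """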
    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 act="leaky"):
        super(ConvBNLayer, self).__init__()

        self.conv = Conv2D(
            num_channels=ch_in,
            num_filters=ch_out,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            param_attr=ParamAttr(
                initializer=fluid.initializer.Normal(0., 0.02)),
            bias_attr=False,
            act=None)
        self.batch_norm = BatchNorm(
            num_channels=ch_out,
            param_attr=ParamAttr(
                initializer=fluid.initializer.Normal(0., 0.02),
                regularizer=L2Decay(0.)),
            bias_attr=ParamAttr(
                initializer=fluid.initializer.Constant(0.0),
                regularizer=L2Decay(0.)))

        self.act = act

    def forward(self, inputs):
        out = self.conv(inputs)
        out = self.batch_norm(out)
        if self.act == 'leaky':
            out = fluid.layers.leaky_relu(x=out, alpha=0.1)
        return out


class YoloDetectionBlock(fluid.dygraph.Layer):
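    """Detection head block of YOLOv3.

    Applies a stack of alternating 1x1 and 3x3 ConvBNLayer units to the
    input feature map and returns two tensors: 'route', which is passed
    on to the next (finer) detection scale, and 'tip', which feeds the
    output convolution of the current scale.
    """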
    def __init__(self, ch_in, channel):
        super(YoloDetectionBlock, self).__init__()

        assert channel % 2 == 0, \
            "channel {} cannot be divided by 2".format(channel)

        self.conv0 = ConvBNLayer(
            ch_in=ch_in, ch_out=channel, filter_size=1, stride=1, padding=0)
        self.conv1 = ConvBNLayer(
            ch_in=channel,
            ch_out=channel * 2,
            filter_size=3,
            stride=1,
            padding=1)
        self.conv2 = ConvBNLayer(
            ch_in=channel * 2,
            ch_out=channel,
            filter_size=1,
            stride=1,
            padding=0)
        self.conv3 = ConvBNLayer(
            ch_in=channel,
            ch_out=channel * 2,
            filter_size=3,
            stride=1,
            padding=1)
        self.route = ConvBNLayer(
            ch_in=channel * 2,
            ch_out=channel,
            filter_size=1,
            stride=1,
            padding=0)
        self.tip = ConvBNLayer(
            ch_in=channel,
            ch_out=channel * 2,
            filter_size=3,
            stride=1,
            padding=1)

    def forward(self, inputs):
        out = self.conv0(inputs)
        out = self.conv1(out)
        out = self.conv2(out)
        out = self.conv3(out)
        route = self.route(out)
        tip = self.tip(route)
        return route, tip


class YOLOv3(fluid.dygraph.Layer):
    """YOLOv3 model from
    `"YOLOv3: An Incremental Improvement" <https://arxiv.org/abs/1804.02767>`_

    Args:
        num_classes (int): class number, default 80.
        model_mode (str): one of 'train', 'eval' or 'test'. The network
            structure differs in the output layer: in 'train' mode no
            decoding layer is appended, while in 'eval' and 'test' mode
            the output feature maps are decoded into predictions by
            'fluid.layers.yolo_box'. 'eval' mode returns both feature
            maps and predictions, 'test' mode returns predictions only.
            Default 'train'.
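
    Examples (an illustrative sketch; assumes dynamic graph mode is
    enabled and the local 'darknet' module is importable):

        .. code-block:: python

            # 'test' mode also skips the backbone pretrained-weight
            # download performed in 'train' mode
            model = YOLOv3(num_classes=80, model_mode='test')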

    """

    def __init__(self, num_classes=80, model_mode='train'):
        super(YOLOv3, self).__init__()
        self.num_classes = num_classes
        assert str.lower(model_mode) in ['train', 'eval', 'test'], \
            "model_mode should be 'train' 'eval' or 'test', but got " \
            "{}".format(model_mode)
        self.model_mode = str.lower(model_mode)
        self.anchors = [
            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
            373, 326
        ]
        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        self.valid_thresh = 0.005
        self.nms_thresh = 0.45
        self.nms_topk = 400
        self.nms_posk = 100
        self.draw_thresh = 0.5

        self.backbone = darknet53(pretrained=(model_mode == 'train'))
        self.block_outputs = []
        self.yolo_blocks = []
        self.route_blocks = []

        for idx, num_chan in enumerate([1024, 768, 384]):
            yolo_block = self.add_sublayer(
                "yolo_detecton_block_{}".format(idx),
                YoloDetectionBlock(num_chan, 512 // (2**idx)))
            self.yolo_blocks.append(yolo_block)

            num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5)

            block_out = self.add_sublayer(
                "block_out_{}".format(idx),
                Conv2D(
                    num_channels=1024 // (2**idx),
                    num_filters=num_filters,
                    filter_size=1,
                    act=None,
                    param_attr=ParamAttr(
                        initializer=fluid.initializer.Normal(0., 0.02)),
                    bias_attr=ParamAttr(
                        initializer=fluid.initializer.Constant(0.0),
                        regularizer=L2Decay(0.))))
            self.block_outputs.append(block_out)
            if idx < 2:
                route = self.add_sublayer(
                    "route2_{}".format(idx),
                    ConvBNLayer(
                        ch_in=512 // (2**idx),
                        ch_out=256 // (2**idx),
                        filter_size=1,
                        act='leaky'))
                self.route_blocks.append(route)

    def forward(self, img_id, img_shape, inputs):
        outputs = []
        boxes = []
        scores = []
        downsample = 32

        feats = self.backbone(inputs)
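        # Build the three detection scales from coarse to fine: each YOLO
        # block consumes a backbone feature map (concatenated with the
        # upsampled 'route' of the previous scale when idx > 0), and its
        # 'tip' is projected to the raw prediction map. In 'eval'/'test'
        # mode the prediction map is also decoded into boxes and scores
        # by fluid.layers.yolo_box.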
        route = None
        for idx, feat in enumerate(feats):
            if idx > 0:
                feat = fluid.layers.concat(input=[route, feat], axis=1)
            route, tip = self.yolo_blocks[idx](feat)
            block_out = self.block_outputs[idx](tip)
            outputs.append(block_out)

            if idx < 2:
                route = self.route_blocks[idx](route)
                route = fluid.layers.resize_nearest(route, scale=2)

            if self.model_mode != 'train':
                anchor_mask = self.anchor_masks[idx]
                mask_anchors = []
                for m in anchor_mask:
                    mask_anchors.append(self.anchors[2 * m])
                    mask_anchors.append(self.anchors[2 * m + 1])
                b, s = fluid.layers.yolo_box(
                    x=block_out,
                    img_size=img_shape,
                    anchors=mask_anchors,
                    class_num=self.num_classes,
                    conf_thresh=self.valid_thresh,
                    downsample_ratio=downsample)

                boxes.append(b)
                scores.append(fluid.layers.transpose(s, perm=[0, 2, 1]))

            downsample //= 2

        if self.model_mode == 'train':
            return outputs

        preds = [
            img_id,
            fluid.layers.multiclass_nms(
                bboxes=fluid.layers.concat(boxes, axis=1),
                scores=fluid.layers.concat(scores, axis=2),
                score_threshold=self.valid_thresh,
                nms_top_k=self.nms_topk,
                keep_top_k=self.nms_posk,
                nms_threshold=self.nms_thresh,
                background_label=-1)
        ]

        if self.model_mode == 'test':
            return preds

        # model_mode == "eval"
        return outputs + preds


class YoloLoss(fluid.dygraph.Layer):
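    """Loss layer for YOLOv3.

    Applies fluid.layers.yolov3_loss to each of the three prediction
    maps (downsample ratios 32, 16 and 8) and returns the list of
    per-scale mean losses.
    """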
    def __init__(self, num_classes=80, num_max_boxes=50):
        super(YoloLoss, self).__init__()
        self.num_classes = num_classes
        self.num_max_boxes = num_max_boxes
        self.ignore_thresh = 0.7
        self.anchors = [
            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
            373, 326
        ]
        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    def forward(self, *inputs):
        downsample = 32
        losses = []

        # YOLOv3 output fields differ between 'train' and 'eval' mode
        if len(inputs) == 6:
            output1, output2, output3, gt_box, gt_label, gt_score = inputs
        elif len(inputs) == 8:
            (output1, output2, output3, img_id, bbox,
             gt_box, gt_label, gt_score) = inputs

        outputs = [output1, output2, output3]
        for idx, out in enumerate(outputs):
            anchor_mask = self.anchor_masks[idx]
            loss = fluid.layers.yolov3_loss(
                x=out,
                gt_box=gt_box,
                gt_label=gt_label,
                gt_score=gt_score,
                anchor_mask=anchor_mask,
                downsample_ratio=downsample,
                anchors=self.anchors,
                class_num=self.num_classes,
                ignore_thresh=self.ignore_thresh,
                use_label_smooth=False)
            loss = fluid.layers.reduce_mean(loss)
            losses.append(loss)
            downsample //= 2
        return losses


def _yolov3_darknet(num_layers=53,
                    num_classes=80,
                    num_max_boxes=50,
                    model_mode='train',
                    pretrained=True):
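    # InputSpec definitions for paddle.Model: inputs are (img_id, img_shape,
    # image) and labels are (gt_bbox, gt_label, gt_score).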
    inputs = [
        InputSpec([None, 1], 'int64', name='img_id'),
        InputSpec([None, 2], 'int32', name='img_shape'),
        InputSpec([None, 3, None, None], 'float32', name='image')
    ]
    labels = [
        InputSpec([None, num_max_boxes, 4], 'float32', name='gt_bbox'),
        InputSpec([None, num_max_boxes], 'int32', name='gt_label'),
        InputSpec([None, num_max_boxes], 'float32', name='gt_score')
    ]
    net = YOLOv3(num_classes, model_mode)
    model = paddle.Model(net, inputs, labels)
    if pretrained:
        assert num_layers in pretrain_infos.keys(), \
                "YOLOv3-DarkNet{} do not have pretrained weights now, " \
                "pretrained should be set as False".format(num_layers)
        weight_path = get_weights_path_from_url(*(pretrain_infos[num_layers]))
        assert weight_path.endswith('.pdparams'), \
                "suffix of weight must be .pdparams"
        model.load(weight_path)
    return model


def yolov3_darknet53(num_classes=80,
                     num_max_boxes=50,
                     model_mode='train',
                     pretrained=True):
    """YOLOv3 model with 53-layer DarkNet as backbone
    
    Args:
        num_classes (int): class number, default 80.
        num_max_boxes (int): max bbox number in an image, default 50.
        model_mode (str): one of 'train', 'eval' or 'test'. The network
            structure differs in the output layer: in 'train' mode no
            decoding layer is appended, while in 'eval' and 'test' mode
            the output feature maps are decoded into predictions by
            'fluid.layers.yolo_box'. 'eval' mode returns both feature
            maps and predictions, 'test' mode returns predictions only.
            Default 'train'.
        pretrained (bool): If True, returns a model pre-trained on COCO,
            default True.
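
    Examples (an illustrative sketch; assumes dynamic graph mode is
    enabled and the local 'darknet' module is importable):

        .. code-block:: python

            # pretrained=False and model_mode='test' avoid any weight
            # download in this sketch
            model = yolov3_darknet53(num_classes=80,
                                     model_mode='test',
                                     pretrained=False)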
    """
    return _yolov3_darknet(53, num_classes, num_max_boxes, model_mode,
                           pretrained)