diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e4da0ddd534262daaa0a5febb0b830369d648179..04d62005bcd871d8b2e655f62695a59b9296e7e0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,3 @@ jobs: exit $pylint_ret fi echo "All lint steps passed!" - - - name: Import hubconf check - run: | - python -c "import hubconf" diff --git a/README.md b/README.md index 09b9cbc0a5b1d6fcd379d628783dd798f7b019af..92bfcaedad5fd55a4308af85838967727a4c7416 100644 --- a/README.md +++ b/README.md @@ -75,9 +75,10 @@ export PYTHONPATH=/path/to/models:$PYTHONPATH 目标检测同样是计算机视觉中的常见任务,我们提供了一个经典的目标检测模型[retinanet](./official/vision/detection),这个模型在**COCO验证集**上的测试结果如下: -| 模型 | mAP
@5-95 | -| :---: | :---: | -| retinanet-res50-1x-800size | 36.0 | +| 模型 | mAP
@5-95 | +| :---: | :---: | +| retinanet-res50-1x-800size | 36.0 | +| faster-rcnn-fpn-res50-1x-800size | 37.3 | ### 图像分割 diff --git a/hubconf.py b/hubconf.py index ab6e6c7088d82f4c280242e977fe45ce795c06a4..fc21310bed004f9b0c54a09b3ae1aa121e4d4ae4 100644 --- a/hubconf.py +++ b/hubconf.py @@ -28,10 +28,15 @@ from official.nlp.bert.model import ( wwm_cased_L_24_H_1024_A_16, ) +from official.vision.detection.faster_rcnn_fpn_res50_coco_1x_800size import ( + faster_rcnn_fpn_res50_coco_1x_800size, +) + from official.vision.detection.retinanet_res50_coco_1x_800size import ( retinanet_res50_coco_1x_800size, ) -from official.vision.detection.models import RetinaNet + +from official.vision.detection.models import FasterRCNN, RetinaNet from official.vision.detection.tools.test import DetEvaluator from official.vision.segmentation.deeplabv3plus import ( diff --git a/official/vision/detection/README.md b/official/vision/detection/README.md index 4b0481d230e2403cb44c1f155ba956cfae1ae417..ea78f84be8e67b34008cd612d46411a20105a6a4 100644 --- a/official/vision/detection/README.md +++ b/official/vision/detection/README.md @@ -1,20 +1,21 @@ -# Megengine RetinaNet +# Megengine Detection Models ## 介绍 -本目录包含了采用MegEngine实现的经典[RetinaNet](https://arxiv.org/pdf/1708.02002>)网络结构,同时提供了在COCO2017数据集上的完整训练和测试代码。 +本目录包含了采用MegEngine实现的经典网络结构,包括[RetinaNet](https://arxiv.org/pdf/1708.02002>)、[Faster R-CNN with FPN](https://arxiv.org/pdf/1612.03144.pdf)等,同时提供了在COCO2017数据集上的完整训练和测试代码。 -网络的性能在COCO2017验证集上的测试结果如下: +网络的性能在COCO2017数据集上的测试结果如下: -| 模型 | mAP
@5-95 | batch
/gpu | gpu | speed
(8gpu) | speed
(1gpu) | -| --- | --- | --- | --- | --- | --- | -| retinanet-res50-coco-1x-800size | 36.0 | 2 | 2080ti | 2.27(it/s) | 3.7(it/s) | +| 模型 | mAP
@5-95 | batch
/gpu | gpu | training speed
(8gpu) | training speed
(1gpu) | +| --- | --- | --- | --- | --- | --- | +| retinanet-res50-coco-1x-800size | 36.0 | 2 | 2080Ti | 2.27(it/s) | 3.7(it/s) | +| faster-rcnn-fpn-res50-coco-1x-800size | 37.3 | 2 | 2080Ti | 1.9(it/s) | 3.1(it/s) | * MegEngine v0.4.0 ## 如何使用 -模型训练好之后,可以通过如下命令测试单张图片: +以RetinaNet为例,模型训练好之后,可以通过如下命令测试单张图片: ```bash python3 tools/inference.py -f retinanet_res50_coco_1x_800size.py \ @@ -60,17 +61,33 @@ python3 tools/train.py -f retinanet_res50_coco_1x_800size.py \ `tools/train.py`提供了灵活的命令行选项,包括: -- `-f`, 所需要训练的网络结构描述文件。 +- `-f`, 所需要训练的网络结构描述文件。可以是RetinaNet、Faster R-CNN等. - `-n`, 用于训练的devices(gpu)数量,默认使用所有可用的gpu. - `-w`, 预训练的backbone网络权重的路径。 - `--batch_size`,训练时采用的`batch size`, 默认2,表示每张卡训2张图。 - `--dataset-dir`, COCO2017数据集的上级目录,默认`/data/datasets`。 -默认情况下模型会存在 `log-of-retinanet_res50_1x_800size`目录下。 +默认情况下模型会存在 `log-of-模型名`目录下。 + +5. 编译可能需要的lib + +GPU NMS位于tools下的GPU NMS文件夹下面,我们需要进入tools文件夹下进行编译. + +首先需要找到MegEngine编译的头文件所在路径,可以通过命令 + +```bash +python3 -c "import megengine as mge; print(mge.__file__)" +``` +将输出结果中__init__.py之前的部分复制(以MegEngine结尾),将其赋值给shell变量MGE,接下来,运行如下命令进行编译。 + +```bash +cd tools +nvcc -I $MGE/_internal/include -shared -o lib_nms.so -Xcompiler "-fno-strict-aliasing -fPIC" gpu_nms/nms.cu +``` ## 如何测试 -在训练的过程中,可以通过如下命令测试模型在`COCO2017`验证集的性能: +在得到训练完保存的模型之后,可以通过tools下的test.py文件测试模型在`COCO2017`验证集的性能: ```bash python3 tools/test.py -f retinanet_res50_coco_1x_800size.py \ @@ -89,5 +106,6 @@ python3 tools/test.py -f retinanet_res50_coco_1x_800size.py \ ## 参考文献 - [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002) Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, Piotr Dollár. Proceedings of the IEEE international conference on computer vision. 2017: 2980-2988. -- [Microsoft COCO: Common Objects in Context](https://arxiv.org/pdf/1405.0312.pdf) Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Dollár, Piotr and Zitnick, C Lawrence -Lin T Y, Maire M, Belongie S, et al. European conference on computer vision. Springer, Cham, 2014: 740-755. +- [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/pdf/1506.01497.pdf) S. Ren, K. He, R. Girshick, and J. Sun. In: Neural Information Processing Systems(NIPS)(2015). +- [Feature Pyramid Networks for Object Detection](https://arxiv.org/pdf/1612.03144.pdf) T. Lin, P. Dollár, R. Girshick, K. He, B. Hariharan and S. Belongie. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Honolulu, HI, 2017, pp. 936-944, doi: 10.1109/CVPR.2017.106. +- [Microsoft COCO: Common Objects in Context](https://arxiv.org/pdf/1405.0312.pdf) Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Dollár, Piotr and Zitnick, C Lawrence, Lin T Y, Maire M, Belongie S, et al. European conference on computer vision. Springer, Cham, 2014: 740-755. diff --git a/official/vision/detection/faster_rcnn_fpn_res50_coco_1x_800size.py b/official/vision/detection/faster_rcnn_fpn_res50_coco_1x_800size.py new file mode 100644 index 0000000000000000000000000000000000000000..c085b073b8be405c443b7eed7c6820f73170ce0c --- /dev/null +++ b/official/vision/detection/faster_rcnn_fpn_res50_coco_1x_800size.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from megengine import hub + +from official.vision.detection import models + + +@hub.pretrained( + "https://data.megengine.org.cn/models/weights/" + "faster_rcnn_fpn_ec2e80b9_res50_1x_800size_37dot3.pkl" +) +def faster_rcnn_fpn_res50_coco_1x_800size(batch_size=1, **kwargs): + r""" + Faster-RCNN FPN trained from COCO dataset. + `"Faster-RCNN" `_ + `"FPN" `_ + `"COCO" `_ + """ + return models.FasterRCNN(models.FasterRCNNConfig(), batch_size=batch_size, **kwargs) + + +Net = models.FasterRCNN +Cfg = models.FasterRCNNConfig diff --git a/official/vision/detection/faster_rcnn_fpn_res50_coco_1x_800size_syncbn.py b/official/vision/detection/faster_rcnn_fpn_res50_coco_1x_800size_syncbn.py new file mode 100644 index 0000000000000000000000000000000000000000..c557d46c3bef891b312644ecfb521a0b7b5d2389 --- /dev/null +++ b/official/vision/detection/faster_rcnn_fpn_res50_coco_1x_800size_syncbn.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from megengine import hub + +from official.vision.detection import models + + +class CustomFasterRCNNFPNConfig(models.FasterRCNNConfig): + def __init__(self): + super().__init__() + + self.resnet_norm = "SyncBN" + self.fpn_norm = "SyncBN" + + +@hub.pretrained( + "https://data.megengine.org.cn/models/weights/" + "faster_rcnn_fpn_cf5c020b_res50_1x_800size_syncbn_37dot6.pkl" +) +def faster_rcnn_fpn_res50_coco_1x_800size_syncbn(batch_size=1, **kwargs): + r""" + Faster-RCNN FPN trained from COCO dataset. + `"Faster-RCNN" `_ + `"FPN" `_ + `"COCO" `_ + `"SyncBN" `_ + """ + return models.FasterRCNN(CustomFasterRCNNFPNConfig(), batch_size=batch_size, **kwargs) + + +Net = models.FasterRCNN +Cfg = CustomFasterRCNNFPNConfig diff --git a/official/vision/detection/layers/basic/nn.py b/official/vision/detection/layers/basic/nn.py index bc2d98c07a6a7134bfd5e5ddd1a60f99c93c36b3..349df5292331ee1aec8cfecce83d24c7eb646bdd 100644 --- a/official/vision/detection/layers/basic/nn.py +++ b/official/vision/detection/layers/basic/nn.py @@ -22,7 +22,7 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # # This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. +# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. # --------------------------------------------------------------------- from collections import namedtuple diff --git a/official/vision/detection/layers/basic/norm.py b/official/vision/detection/layers/basic/norm.py index 5d0463a333c259267268c3ecdb8c3f17ba7a16c9..43a0917952761c048810bbcbfd883e68d9f080b5 100644 --- a/official/vision/detection/layers/basic/norm.py +++ b/official/vision/detection/layers/basic/norm.py @@ -22,7 +22,7 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # # This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. 
All rights reserved. +# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. # --------------------------------------------------------------------- import megengine.module as M import numpy as np diff --git a/official/vision/detection/layers/det/__init__.py b/official/vision/detection/layers/det/__init__.py index 33d29cee66e4efbb81433b67fa898b3a8e844f40..9dfcc1648c1229a290ce0eb6eaa527cfb8d4dfdd 100644 --- a/official/vision/detection/layers/det/__init__.py +++ b/official/vision/detection/layers/det/__init__.py @@ -10,7 +10,10 @@ from .anchor import * from .box_utils import * from .fpn import * from .loss import * +from .pooler import * +from .rcnn import * from .retinanet import * +from .rpn import * _EXCLUDE = {} __all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")] diff --git a/official/vision/detection/layers/det/anchor.py b/official/vision/detection/layers/det/anchor.py index 4ceb756c27e97e30d355fbec557952cb7cc8e18e..505553f753b987d0d4ddd78c5b27ba63cdea498c 100644 --- a/official/vision/detection/layers/det/anchor.py +++ b/official/vision/detection/layers/det/anchor.py @@ -1,18 +1,4 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Open-MMLab. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# --------------------------------------------------------------------- # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") # # Copyright (c) 2014-2020 Megvii Inc. All rights reserved. @@ -20,10 +6,6 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# -# This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. 
-# --------------------------------------------------------------------- from abc import ABCMeta, abstractmethod import megengine.functional as F @@ -132,8 +114,7 @@ class DefaultAnchorGenerator(BaseAnchorGenerator): [flatten_shift_x, flatten_shift_y, flatten_shift_x, flatten_shift_y, ], axis=1, ) - if self.offset > 0: - centers = centers + self.offset * stride + centers = centers + self.offset * self.base_size return centers def get_anchors_by_feature(self, featmap, stride): diff --git a/official/vision/detection/layers/det/box_utils.py b/official/vision/detection/layers/det/box_utils.py index aa0ec10b6eab5ca0fda6ba620717066251e6665d..4d03f5e8b1e0ac86a15320f6ea5b065cf317df1e 100644 --- a/official/vision/detection/layers/det/box_utils.py +++ b/official/vision/detection/layers/det/box_utils.py @@ -112,12 +112,12 @@ class BoxCoder(BoxCoderBase, metaclass=ABCMeta): pred_y2 = pred_ctr_y + 0.5 * pred_height pred_box = self._concat_new_axis(pred_x1, pred_y1, pred_x2, pred_y2, 2) - pred_box = pred_box.reshape(pred_box.shape[0], -1) + pred_box = pred_box.reshape(pred_box.shapeof(0), -1) return pred_box -def get_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: +def get_iou(boxes1: Tensor, boxes2: Tensor, return_ignore=False) -> Tensor: """ Given two lists of boxes of size N and M, compute the IoU (intersection over union) @@ -132,10 +132,10 @@ def get_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: """ box = boxes1 gt = boxes2 - target_shape = (boxes1.shape[0], boxes2.shapeof()[0], 4) + target_shape = (boxes1.shapeof(0), boxes2.shapeof(0), 4) b_box = F.add_axis(boxes1, 1).broadcast(*target_shape) - b_gt = F.add_axis(boxes2, 0).broadcast(*target_shape) + b_gt = F.add_axis(boxes2[:, :4], 0).broadcast(*target_shape) iw = F.minimum(b_box[:, :, 2], b_gt[:, :, 2]) - F.maximum( b_box[:, :, 0], b_gt[:, :, 0] @@ -148,7 +148,7 @@ def get_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: area_box = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1]) area_gt = (gt[:, 2] - gt[:, 0]) * (gt[:, 3] - gt[:, 1]) - area_target_shape = (box.shape[0], gt.shapeof()[0]) + area_target_shape = (box.shapeof(0), gt.shapeof(0)) b_area_box = F.add_axis(area_box, 1).broadcast(*area_target_shape) b_area_gt = F.add_axis(area_gt, 0).broadcast(*area_target_shape) @@ -156,20 +156,34 @@ def get_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: union = b_area_box + b_area_gt - inter overlaps = F.maximum(inter / union, 0) + if return_ignore: + overlaps_ignore = F.maximum(inter / b_area_box, 0) + gt_ignore_mask = F.add_axis((gt[:, 4] == -1), 0).broadcast(*area_target_shape) + overlaps *= (1 - gt_ignore_mask) + overlaps_ignore *= gt_ignore_mask + return overlaps, overlaps_ignore + return overlaps def get_clipped_box(boxes, hw): """ Clip the boxes into the image region.""" # x1 >=0 - box_x1 = F.maximum(F.minimum(boxes[:, 0::4], hw[1]), 0) + box_x1 = F.clamp(boxes[:, 0::4], lower=0, upper=hw[1]) # y1 >=0 - box_y1 = F.maximum(F.minimum(boxes[:, 1::4], hw[0]), 0) + box_y1 = F.clamp(boxes[:, 1::4], lower=0, upper=hw[0]) # x2 < im_info[1] - box_x2 = F.maximum(F.minimum(boxes[:, 2::4], hw[1]), 0) + box_x2 = F.clamp(boxes[:, 2::4], lower=0, upper=hw[1]) # y2 < im_info[0] - box_y2 = F.maximum(F.minimum(boxes[:, 3::4], hw[0]), 0) + box_y2 = F.clamp(boxes[:, 3::4], lower=0, upper=hw[0]) clip_box = F.concat([box_x1, box_y1, box_x2, box_y2], axis=1) return clip_box + + +def filter_boxes(boxes, size=0): + width = boxes[:, 2] - boxes[:, 0] + height = boxes[:, 3] - boxes[:, 1] + keep = (width > size) * (height > size) + return keep diff --git 
a/official/vision/detection/layers/det/fpn.py b/official/vision/detection/layers/det/fpn.py index 3978eac325e8e528fb2ff9ba21f97fff320bd778..b77f35ced9a6fbdf1fa386cb96c2913e658514ae 100644 --- a/official/vision/detection/layers/det/fpn.py +++ b/official/vision/detection/layers/det/fpn.py @@ -22,7 +22,7 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # # This file has been modified by Megvii ("Megvii Modifications"). -# All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved. +# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. # --------------------------------------------------------------------- import math from typing import List @@ -47,6 +47,8 @@ class FPN(M.Module): out_channels: int = 256, norm: str = "", top_block: M.Module = None, + strides=[8, 16, 32], + channels=[512, 1024, 2048], ): """ Args: @@ -63,8 +65,8 @@ class FPN(M.Module): """ super(FPN, self).__init__() - in_strides = [8, 16, 32] - in_channels = [512, 1024, 2048] + in_strides = strides + in_channels = channels use_bias = norm == "" self.lateral_convs = list() @@ -148,33 +150,50 @@ class FPN(M.Module): top_block_in_feature = results[ self._out_features.index(self.top_block.in_feature) ] - results.extend(self.top_block(top_block_in_feature, results[-1])) + results.extend(self.top_block(top_block_in_feature)) return dict(zip(self._out_features, results)) def output_shape(self): return { - name: layers.ShapeSpec(channels=self._out_feature_channels[name],) + name: layers.ShapeSpec( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name], + ) for name in self._out_features } +class FPNP6(M.Module): + """ + used in FPN, generate a downsampled P6 feature from P5. + """ + + def __init__(self, in_feature="p5"): + super().__init__() + self.num_levels = 1 + self.in_feature = in_feature + + def forward(self, x): + return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)] + + class LastLevelP6P7(M.Module): """ This module is used in RetinaNet to generate extra layers, P6 and P7 from C5 feature. """ - def __init__(self, in_channels: int, out_channels: int): + def __init__(self, in_channels: int, out_channels: int, in_feature="res5"): super().__init__() self.num_levels = 2 - self.in_feature = "res5" + if in_feature == "p5": + assert in_channels == out_channels + self.in_feature = in_feature self.p6 = M.Conv2d(in_channels, out_channels, 3, 2, 1) self.p7 = M.Conv2d(out_channels, out_channels, 3, 2, 1) - self.use_P5 = in_channels == out_channels - def forward(self, c5, p5=None): - x = p5 if self.use_P5 else c5 + def forward(self, x): p6 = self.p6(x) p7 = self.p7(F.relu(p6)) return [p6, p7] diff --git a/official/vision/detection/layers/det/loss.py b/official/vision/detection/layers/det/loss.py index 1d48adba183cc81c70994af2a59173d758398428..3fda1e4b4295572a24391bcac75913bffdb7a144 100644 --- a/official/vision/detection/layers/det/loss.py +++ b/official/vision/detection/layers/det/loss.py @@ -6,11 +6,9 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-import megengine as mge import megengine.functional as F -import numpy as np -from megengine.core import tensor, Tensor +from megengine.core import Tensor def get_focal_loss( @@ -112,7 +110,8 @@ def get_smooth_l1_loss( if norm_type == "fg": loss = (losses.sum(axis=1) * fg_mask).sum() / F.maximum(fg_mask.sum(), 1) elif norm_type == "all": - raise NotImplementedError + all_mask = (label != ignore_label) + loss = (losses.sum(axis=1) * fg_mask).sum() / F.maximum(all_mask.sum(), 1) else: raise NotImplementedError @@ -151,5 +150,19 @@ def get_smooth_l1_base( abs_x = F.abs(x) in_loss = 0.5 * x ** 2 * sigma2 out_loss = abs_x - 0.5 / sigma2 - loss = F.where(abs_x < cond_point, in_loss, out_loss) + + in_mask = abs_x < cond_point + out_mask = 1 - in_mask + loss = in_loss * in_mask + out_loss * out_mask + return loss + + +def softmax_loss(score, label, ignore_label=-1): + max_score = F.zero_grad(score.max(axis=1, keepdims=True)) + score -= max_score + log_prob = score - F.log(F.exp(score).sum(axis=1, keepdims=True)) + mask = (label != ignore_label) + vlabel = label * mask + loss = -(F.indexing_one_hot(log_prob, vlabel.astype("int32"), 1) * mask).sum() + loss = loss / F.maximum(mask.sum(), 1) return loss diff --git a/official/vision/detection/layers/det/pooler.py b/official/vision/detection/layers/det/pooler.py new file mode 100644 index 0000000000000000000000000000000000000000..d415b520a06676624d6645939b061deead331965 --- /dev/null +++ b/official/vision/detection/layers/det/pooler.py @@ -0,0 +1,63 @@ +# -*- coding:utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import math + +import numpy as np +import megengine as mge +import megengine.functional as F + + +def roi_pool( + rpn_fms, rois, stride, pool_shape, roi_type='roi_align', +): + assert len(stride) == len(rpn_fms) + canonical_level = 4 + canonical_box_size = 224 + min_level = math.log2(stride[0]) + max_level = math.log2(stride[-1]) + + num_fms = len(rpn_fms) + box_area = (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2]) + level_assignments = F.floor( + canonical_level + F.log(box_area.sqrt() / canonical_box_size) / np.log(2) + ) + level_assignments = F.minimum(level_assignments, max_level) + level_assignments = F.maximum(level_assignments, min_level) + level_assignments = level_assignments - min_level + + # avoid empty assignment + level_assignments = F.concat( + [level_assignments, mge.tensor(np.arange(num_fms, dtype=np.int32))], + ) + rois = F.concat([rois, mge.zeros((num_fms, rois.shapeof(-1)))]) + + pool_list, inds_list = [], [] + for i in range(num_fms): + mask = (level_assignments == i) + _, inds = F.cond_take(mask == 1, mask) + level_rois = rois.ai[inds] + if roi_type == 'roi_pool': + pool_fm = F.roi_pooling( + rpn_fms[i], level_rois, pool_shape, + mode='max', scale=1.0/stride[i] + ) + elif roi_type == 'roi_align': + pool_fm = F.roi_align( + rpn_fms[i], level_rois, pool_shape, mode='average', + spatial_scale=1.0/stride[i], sample_points=2, aligned=True + ) + pool_list.append(pool_fm) + inds_list.append(inds) + + fm_order = F.concat(inds_list, axis=0) + fm_order = F.argsort(fm_order.reshape(1, -1))[1].reshape(-1) + pool_feature = F.concat(pool_list, axis=0) + pool_feature = pool_feature.ai[fm_order][:-num_fms] + + return pool_feature diff --git a/official/vision/detection/layers/det/rcnn.py b/official/vision/detection/layers/det/rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..a52edcf8ffeed4cda75822275934e905d7eedfcc --- /dev/null +++ b/official/vision/detection/layers/det/rcnn.py @@ -0,0 +1,175 @@ +# -*- coding:utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import megengine as mge +import megengine.functional as F +import megengine.module as M + +from official.vision.detection import layers + + +class RCNN(M.Module): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.box_coder = layers.BoxCoder( + reg_mean=cfg.bbox_normalize_means, + reg_std=cfg.bbox_normalize_stds + ) + + # roi head + self.in_features = cfg.rcnn_in_features + self.stride = cfg.rcnn_stride + self.pooling_method = cfg.pooling_method + self.pooling_size = cfg.pooling_size + + self.fc1 = M.Linear(256 * self.pooling_size[0] * self.pooling_size[1], 1024) + self.fc2 = M.Linear(1024, 1024) + for l in [self.fc1, self.fc2]: + M.init.normal_(l.weight, std=0.01) + M.init.fill_(l.bias, 0) + + # box predictor + self.pred_cls = M.Linear(1024, cfg.num_classes + 1) + self.pred_delta = M.Linear(1024, (cfg.num_classes + 1) * 4) + M.init.normal_(self.pred_cls.weight, std=0.01) + M.init.normal_(self.pred_delta.weight, std=0.001) + for l in [self.pred_cls, self.pred_delta]: + M.init.fill_(l.bias, 0) + + def forward(self, fpn_fms, rcnn_rois, im_info=None, gt_boxes=None): + rcnn_rois, labels, bbox_targets = self.get_ground_truth(rcnn_rois, im_info, gt_boxes) + + fpn_fms = [fpn_fms[x] for x in self.in_features] + pool_features = layers.roi_pool( + fpn_fms, rcnn_rois, self.stride, + self.pooling_size, self.pooling_method, + ) + flatten_feature = F.flatten(pool_features, start_axis=1) + roi_feature = F.relu(self.fc1(flatten_feature)) + roi_feature = F.relu(self.fc2(roi_feature)) + pred_cls = self.pred_cls(roi_feature) + pred_delta = self.pred_delta(roi_feature) + + if self.training: + # loss for classification + loss_rcnn_cls = layers.softmax_loss(pred_cls, labels) + # loss for regression + pred_delta = pred_delta.reshape(-1, self.cfg.num_classes + 1, 4) + + vlabels = labels.reshape(-1, 1).broadcast((labels.shapeof(0), 4)) + pred_delta = F.indexing_one_hot(pred_delta, vlabels, axis=1) + + loss_rcnn_loc = layers.get_smooth_l1_loss( + pred_delta, bbox_targets, labels, + self.cfg.rcnn_smooth_l1_beta, + norm_type="all", + ) + loss_dict = { + 'loss_rcnn_cls': loss_rcnn_cls, + 'loss_rcnn_loc': loss_rcnn_loc + } + return loss_dict + else: + # slice 1 for removing background + pred_scores = F.softmax(pred_cls, axis=1)[:, 1:] + pred_delta = pred_delta[:, 4:].reshape(-1, 4) + target_shape = (rcnn_rois.shapeof(0), self.cfg.num_classes, 4) + # rois (N, 4) -> (N, 1, 4) -> (N, 80, 4) -> (N * 80, 4) + base_rois = F.add_axis(rcnn_rois[:, 1:5], 1).broadcast(target_shape).reshape(-1, 4) + pred_bbox = self.box_coder.decode(base_rois, pred_delta) + return pred_bbox, pred_scores + + def get_ground_truth(self, rpn_rois, im_info, gt_boxes): + if not self.training: + return rpn_rois, None, None + + return_rois = [] + return_labels = [] + return_bbox_targets = [] + + # get per image proposals and gt_boxes + for bid in range(self.cfg.batch_per_gpu): + num_valid_boxes = im_info[bid, 4] + gt_boxes_per_img = gt_boxes[bid, :num_valid_boxes, :] + batch_inds = mge.ones((gt_boxes_per_img.shapeof(0), 1)) * bid + # if config.proposal_append_gt: + gt_rois = F.concat([batch_inds, gt_boxes_per_img[:, :4]], axis=1) + batch_roi_mask = (rpn_rois[:, 0] == bid) + _, batch_roi_inds = F.cond_take(batch_roi_mask == 1, batch_roi_mask) + # all_rois : [batch_id, x1, y1, x2, y2] + all_rois = F.concat([rpn_rois.ai[batch_roi_inds], gt_rois]) + + overlaps_normal, overlaps_ignore = layers.get_iou( + all_rois[:, 1:5], gt_boxes_per_img, return_ignore=True, + ) + + max_overlaps_normal = overlaps_normal.max(axis=1) + 
gt_assignment_normal = F.argmax(overlaps_normal, axis=1) + + max_overlaps_ignore = overlaps_ignore.max(axis=1) + gt_assignment_ignore = F.argmax(overlaps_ignore, axis=1) + + ignore_assign_mask = (max_overlaps_normal < self.cfg.fg_threshold) * ( + max_overlaps_ignore > max_overlaps_normal) + max_overlaps = ( + max_overlaps_normal * (1 - ignore_assign_mask) + + max_overlaps_ignore * ignore_assign_mask + ) + gt_assignment = ( + gt_assignment_normal * (1 - ignore_assign_mask) + + gt_assignment_ignore * ignore_assign_mask + ) + gt_assignment = gt_assignment.astype("int32") + labels = gt_boxes_per_img.ai[gt_assignment, 4] + + # ---------------- get the fg/bg labels for each roi ---------------# + fg_mask = (max_overlaps >= self.cfg.fg_threshold) * (labels != self.cfg.ignore_label) + bg_mask = (max_overlaps < self.cfg.bg_threshold_high) * ( + max_overlaps >= self.cfg.bg_threshold_low) + + num_fg_rois = self.cfg.num_rois * self.cfg.fg_ratio + + fg_inds_mask = self._bernoulli_sample_masks(fg_mask, num_fg_rois, 1) + num_bg_rois = self.cfg.num_rois - fg_inds_mask.sum() + bg_inds_mask = self._bernoulli_sample_masks(bg_mask, num_bg_rois, 1) + + labels = labels * fg_inds_mask + + keep_mask = fg_inds_mask + bg_inds_mask + _, keep_inds = F.cond_take(keep_mask == 1, keep_mask) + # Add next line to avoid memory exceed + keep_inds = keep_inds[:F.minimum(self.cfg.num_rois, keep_inds.shapeof(0))] + # labels + labels = labels.ai[keep_inds].astype("int32") + rois = all_rois.ai[keep_inds] + target_boxes = gt_boxes_per_img.ai[gt_assignment.ai[keep_inds], :4] + bbox_targets = self.box_coder.encode(rois[:, 1:5], target_boxes) + bbox_targets = bbox_targets.reshape(-1, 4) + + return_rois.append(rois) + return_labels.append(labels) + return_bbox_targets.append(bbox_targets) + + return ( + F.zero_grad(F.concat(return_rois, axis=0)), + F.zero_grad(F.concat(return_labels, axis=0)), + F.zero_grad(F.concat(return_bbox_targets, axis=0)) + ) + + def _bernoulli_sample_masks(self, masks, num_samples, sample_value): + """ Using the bernoulli sampling method""" + sample_mask = (masks == sample_value) + num_mask = sample_mask.sum() + num_final_samples = F.minimum(num_mask, num_samples) + # here, we use the bernoulli probability to sample the anchors + sample_prob = num_final_samples / num_mask + uniform_rng = mge.random.uniform(sample_mask.shapeof(0)) + after_sampled_mask = (uniform_rng <= sample_prob) * sample_mask + return after_sampled_mask diff --git a/official/vision/detection/layers/det/rpn.py b/official/vision/detection/layers/det/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..dfce3470465faab8b23663964dd8a09eaf5fe37d --- /dev/null +++ b/official/vision/detection/layers/det/rpn.py @@ -0,0 +1,290 @@ +# -*- coding:utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import megengine as mge +import megengine.random as rand +import megengine.functional as F +import megengine.module as M +from official.vision.detection import layers +from official.vision.detection.tools.gpu_nms import batched_nms + + +class RPN(M.Module): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.box_coder = layers.BoxCoder() + + self.stride_list = cfg.rpn_stride + rpn_channel = cfg.rpn_channel + self.in_features = cfg.rpn_in_features + self.anchors_generator = layers.DefaultAnchorGenerator( + cfg.anchor_base_size, + cfg.anchor_scales, + cfg.anchor_aspect_ratios, + cfg.anchor_offset, + ) + self.rpn_conv = M.Conv2d(256, rpn_channel, kernel_size=3, stride=1, padding=1) + self.rpn_cls_score = M.Conv2d( + rpn_channel, cfg.num_cell_anchors * 2, + kernel_size=1, stride=1 + ) + self.rpn_bbox_offsets = M.Conv2d( + rpn_channel, cfg.num_cell_anchors * 4, + kernel_size=1, stride=1 + ) + + for l in [self.rpn_conv, self.rpn_cls_score, self.rpn_bbox_offsets]: + M.init.normal_(l.weight, std=0.01) + M.init.fill_(l.bias, 0) + + def forward(self, features, im_info, boxes=None): + # prediction + features = [features[x] for x in self.in_features] + + # get anchors + all_anchors_list = [ + self.anchors_generator(fm, stride) + for fm, stride in zip(features, self.stride_list) + ] + + pred_cls_score_list = [] + pred_bbox_offsets_list = [] + for x in features: + t = F.relu(self.rpn_conv(x)) + scores = self.rpn_cls_score(t) + pred_cls_score_list.append( + scores.reshape( + scores.shape[0], 2, self.cfg.num_cell_anchors, + scores.shape[2], scores.shape[3] + ) + ) + bbox_offsets = self.rpn_bbox_offsets(t) + pred_bbox_offsets_list.append( + bbox_offsets.reshape( + bbox_offsets.shape[0], self.cfg.num_cell_anchors, 4, + bbox_offsets.shape[2], bbox_offsets.shape[3] + ) + ) + # sample from the predictions + rpn_rois = self.find_top_rpn_proposals( + pred_bbox_offsets_list, pred_cls_score_list, + all_anchors_list, im_info + ) + + if self.training: + rpn_labels, rpn_bbox_targets = self.get_ground_truth( + boxes, im_info, all_anchors_list) + pred_cls_score, pred_bbox_offsets = self.merge_rpn_score_box( + pred_cls_score_list, pred_bbox_offsets_list + ) + + # rpn loss + loss_rpn_cls = layers.softmax_loss(pred_cls_score, rpn_labels) + loss_rpn_loc = layers.get_smooth_l1_loss( + pred_bbox_offsets, + rpn_bbox_targets, + rpn_labels, + self.cfg.rpn_smooth_l1_beta, + norm_type="all" + ) + loss_dict = { + "loss_rpn_cls": loss_rpn_cls, + "loss_rpn_loc": loss_rpn_loc + } + return rpn_rois, loss_dict + else: + return rpn_rois + + def find_top_rpn_proposals( + self, rpn_bbox_offsets_list, rpn_cls_prob_list, + all_anchors_list, im_info + ): + prev_nms_top_n = self.cfg.train_prev_nms_top_n \ + if self.training else self.cfg.test_prev_nms_top_n + post_nms_top_n = self.cfg.train_post_nms_top_n \ + if self.training else self.cfg.test_post_nms_top_n + + batch_per_gpu = self.cfg.batch_per_gpu if self.training else 1 + nms_threshold = self.cfg.rpn_nms_threshold + + list_size = len(rpn_bbox_offsets_list) + + return_rois = [] + + for bid in range(batch_per_gpu): + batch_proposals_list = [] + batch_probs_list = [] + batch_level_list = [] + for l in range(list_size): + # get proposals and probs + offsets = rpn_bbox_offsets_list[l][bid].dimshuffle(2, 3, 0, 1).reshape(-1, 4) + all_anchors = all_anchors_list[l] + proposals = self.box_coder.decode(all_anchors, offsets) + + probs = rpn_cls_prob_list[l][bid, 1].dimshuffle(1, 2, 0).reshape(1, -1) + # prev nms top n + probs, order = F.argsort(probs, descending=True) + 
num_proposals = F.minimum(probs.shapeof(1), prev_nms_top_n) + probs = probs.reshape(-1)[:num_proposals] + order = order.reshape(-1)[:num_proposals] + proposals = proposals.ai[order, :] + + batch_proposals_list.append(proposals) + batch_probs_list.append(probs) + batch_level_list.append(mge.ones(probs.shapeof(0)) * l) + + proposals = F.concat(batch_proposals_list, axis=0) + scores = F.concat(batch_probs_list, axis=0) + level = F.concat(batch_level_list, axis=0) + + proposals = layers.get_clipped_box(proposals, im_info[bid, :]) + # filter empty + keep_mask = layers.filter_boxes(proposals) + _, keep_inds = F.cond_take(keep_mask == 1, keep_mask) + proposals = proposals.ai[keep_inds, :] + scores = scores.ai[keep_inds] + level = level.ai[keep_inds] + + # gather the proposals and probs + # sort nms by scores + scores, order = F.argsort(scores.reshape(1, -1), descending=True) + order = order.reshape(-1) + proposals = proposals.ai[order, :] + level = level.ai[order] + + # apply total level nms + rois = F.concat([proposals, scores.reshape(-1, 1)], axis=1) + keep_inds = batched_nms(proposals, scores, level, nms_threshold, post_nms_top_n) + rois = rois.ai[keep_inds] + + # rois shape (N, 5), info [batch_id, x1, y1, x2, y2] + batch_inds = mge.ones((rois.shapeof(0), 1)) * bid + batch_rois = F.concat([batch_inds, rois[:, :4]], axis=1) + return_rois.append(batch_rois) + + return F.zero_grad(F.concat(return_rois, axis=0)) + + def merge_rpn_score_box(self, rpn_cls_score_list, rpn_bbox_offsets_list): + final_rpn_cls_score_list = [] + final_rpn_bbox_offsets_list = [] + + for bid in range(self.cfg.batch_per_gpu): + batch_rpn_cls_score_list = [] + batch_rpn_bbox_offsets_list = [] + + for i in range(len(self.in_features)): + rpn_cls_score = rpn_cls_score_list[i][bid] \ + .dimshuffle(2, 3, 1, 0).reshape(-1, 2) + rpn_bbox_offsets = rpn_bbox_offsets_list[i][bid] \ + .dimshuffle(2, 3, 0, 1).reshape(-1, 4) + + batch_rpn_cls_score_list.append(rpn_cls_score) + batch_rpn_bbox_offsets_list.append(rpn_bbox_offsets) + + batch_rpn_cls_score = F.concat(batch_rpn_cls_score_list, axis=0) + batch_rpn_bbox_offsets = F.concat(batch_rpn_bbox_offsets_list, axis=0) + + final_rpn_cls_score_list.append(batch_rpn_cls_score) + final_rpn_bbox_offsets_list.append(batch_rpn_bbox_offsets) + + final_rpn_cls_score = F.concat(final_rpn_cls_score_list, axis=0) + final_rpn_bbox_offsets = F.concat(final_rpn_bbox_offsets_list, axis=0) + return final_rpn_cls_score, final_rpn_bbox_offsets + + def per_level_gt( + self, gt_boxes, im_info, anchors, allow_low_quality_matches=True + ): + ignore_label = self.cfg.ignore_label + # get the gt boxes + valid_gt_boxes = gt_boxes[:im_info[4], :] + # compute the iou matrix + overlaps = layers.get_iou(anchors, valid_gt_boxes[:, :4]) + # match the dtboxes + a_shp0 = anchors.shape[0] + max_overlaps = F.max(overlaps, axis=1) + argmax_overlaps = F.argmax(overlaps, axis=1) + # all ignore + labels = mge.ones(a_shp0).astype("int32") * ignore_label + # set negative ones + labels = labels * (max_overlaps >= self.cfg.rpn_negative_overlap) + # set positive ones + fg_mask = (max_overlaps >= self.cfg.rpn_positive_overlap) + const_one = mge.tensor(1.0) + if allow_low_quality_matches: + # make sure that max iou of gt matched + gt_argmax_overlaps = F.argmax(overlaps, axis=0) + num_valid_boxes = valid_gt_boxes.shapeof(0) + gt_id = F.linspace(0, num_valid_boxes - 1, num_valid_boxes).astype("int32") + argmax_overlaps = argmax_overlaps.set_ai(gt_id)[gt_argmax_overlaps] + max_overlaps = max_overlaps.set_ai( + 
const_one.broadcast(num_valid_boxes) + )[gt_argmax_overlaps] + fg_mask = (max_overlaps >= self.cfg.rpn_positive_overlap) + # set positive ones + _, fg_mask_ind = F.cond_take(fg_mask == 1, fg_mask) + labels = labels.set_ai(const_one.broadcast(fg_mask_ind.shapeof(0)))[fg_mask_ind] + # compute the targets + bbox_targets = self.box_coder.encode( + anchors, valid_gt_boxes.ai[argmax_overlaps, :4] + ) + return labels, bbox_targets + + def get_ground_truth(self, gt_boxes, im_info, all_anchors_list): + final_labels_list = [] + final_bbox_targets_list = [] + + for bid in range(self.cfg.batch_per_gpu): + batch_labels_list = [] + batch_bbox_targets_list = [] + for anchors in all_anchors_list: + rpn_labels_perlvl, rpn_bbox_targets_perlvl = self.per_level_gt( + gt_boxes[bid], im_info[bid], anchors, + ) + batch_labels_list.append(rpn_labels_perlvl) + batch_bbox_targets_list.append(rpn_bbox_targets_perlvl) + + concated_batch_labels = F.concat(batch_labels_list, axis=0) + concated_batch_bbox_targets = F.concat(batch_bbox_targets_list, axis=0) + + # sample labels + num_positive = self.cfg.num_sample_anchors * self.cfg.positive_anchor_ratio + # sample positive + concated_batch_labels = self._bernoulli_sample_labels( + concated_batch_labels, + num_positive, 1, self.cfg.ignore_label + ) + # sample negative + num_positive = (concated_batch_labels == 1).sum() + num_negative = self.cfg.num_sample_anchors - num_positive + concated_batch_labels = self._bernoulli_sample_labels( + concated_batch_labels, + num_negative, 0, self.cfg.ignore_label + ) + + final_labels_list.append(concated_batch_labels) + final_bbox_targets_list.append(concated_batch_bbox_targets) + final_labels = F.concat(final_labels_list, axis=0) + final_bbox_targets = F.concat(final_bbox_targets_list, axis=0) + return F.zero_grad(final_labels), F.zero_grad(final_bbox_targets) + + def _bernoulli_sample_labels( + self, labels, num_samples, sample_value, ignore_label=-1 + ): + """ Using the bernoulli sampling method""" + sample_label_mask = (labels == sample_value) + num_mask = sample_label_mask.sum() + num_final_samples = F.minimum(num_mask, num_samples) + # here, we use the bernoulli probability to sample the anchors + sample_prob = num_final_samples / num_mask + uniform_rng = rand.uniform(sample_label_mask.shapeof(0)) + to_ignore_mask = (uniform_rng >= sample_prob) * sample_label_mask + labels = labels * (1 - to_ignore_mask) + to_ignore_mask * ignore_label + + return labels diff --git a/official/vision/detection/models/__init__.py b/official/vision/detection/models/__init__.py index ac9cdf6d1288df22ac8a3e71c04bb49b2c051911..be97d0d7e1fdb241c88b7dfadc4eb34ad40e6a9c 100644 --- a/official/vision/detection/models/__init__.py +++ b/official/vision/detection/models/__init__.py @@ -6,6 +6,7 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .faster_rcnn_fpn import * from .retinanet import * _EXCLUDE = {} diff --git a/official/vision/detection/models/faster_rcnn_fpn.py b/official/vision/detection/models/faster_rcnn_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..7122e3d9ee9e69fb7bc8528eecff414c910b3510 --- /dev/null +++ b/official/vision/detection/models/faster_rcnn_fpn.py @@ -0,0 +1,212 @@ +# -*- coding:utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np +import megengine as mge +import megengine.functional as F +import megengine.module as M + +from official.vision.classification.resnet.model import resnet50 +from official.vision.detection import layers + + +class FasterRCNN(M.Module): + + def __init__(self, cfg, batch_size): + super().__init__() + self.cfg = cfg + cfg.batch_per_gpu = batch_size + self.batch_size = batch_size + # ----------------------- build the backbone ------------------------ # + bottom_up = resnet50(norm=layers.get_norm(cfg.resnet_norm)) + + # ------------ freeze the weights of resnet stage1 and stage 2 ------ # + if self.cfg.backbone_freeze_at >= 1: + for p in bottom_up.conv1.parameters(): + p.requires_grad = False + if self.cfg.backbone_freeze_at >= 2: + for p in bottom_up.layer1.parameters(): + p.requires_grad = False + + # -------------------------- build the FPN -------------------------- # + out_channels = 256 + self.backbone = layers.FPN( + bottom_up=bottom_up, + in_features=["res2", "res3", "res4", "res5"], + out_channels=out_channels, + norm="", + top_block=layers.FPNP6(), + strides=[4, 8, 16, 32], + channels=[256, 512, 1024, 2048], + ) + + # -------------------------- build the RPN -------------------------- # + self.RPN = layers.RPN(cfg) + + # ----------------------- build the RCNN head ----------------------- # + self.RCNN = layers.RCNN(cfg) + + # -------------------------- input Tensor --------------------------- # + self.inputs = { + "image": mge.tensor( + np.random.random([2, 3, 224, 224]).astype(np.float32), dtype="float32", + ), + "im_info": mge.tensor( + np.random.random([2, 5]).astype(np.float32), dtype="float32", + ), + "gt_boxes": mge.tensor( + np.random.random([2, 100, 5]).astype(np.float32), dtype="float32", + ), + } + + def preprocess_image(self, image): + normed_image = ( + image - self.cfg.img_mean[None, :, None, None] + ) / self.cfg.img_std[None, :, None, None] + return layers.get_padded_tensor(normed_image, 32, 0.0) + + def forward(self, inputs): + images = inputs['image'] + im_info = inputs['im_info'] + gt_boxes = inputs['gt_boxes'] + # process the images + normed_images = self.preprocess_image(images) + # normed_images = images + fpn_features = self.backbone(normed_images) + + if self.training: + return self._forward_train(fpn_features, im_info, gt_boxes) + else: + return self.inference(fpn_features, im_info) + + def _forward_train(self, fpn_features, im_info, gt_boxes): + rpn_rois, rpn_losses = self.RPN(fpn_features, im_info, gt_boxes) + rcnn_losses = self.RCNN(fpn_features, rpn_rois, im_info, gt_boxes) + + loss_rpn_cls = rpn_losses['loss_rpn_cls'] + loss_rpn_loc = rpn_losses['loss_rpn_loc'] + loss_rcnn_cls = rcnn_losses['loss_rcnn_cls'] + loss_rcnn_loc = rcnn_losses['loss_rcnn_loc'] + total_loss = loss_rpn_cls + loss_rpn_loc + loss_rcnn_cls + loss_rcnn_loc + + loss_dict = { + "total_loss": total_loss, + "rpn_cls": loss_rpn_cls, + "rpn_loc": loss_rpn_loc, + "rcnn_cls": loss_rcnn_cls, + "rcnn_loc": loss_rcnn_loc + } + self.cfg.losses_keys = list(loss_dict.keys()) + return loss_dict + + def inference(self, fpn_features, im_info): + rpn_rois = self.RPN(fpn_features, im_info) + pred_boxes, pred_score = self.RCNN(fpn_features, rpn_rois) + # pred_score = pred_score[:, None] + pred_boxes = pred_boxes.reshape(-1, 4) + scale_w = im_info[0, 1] / im_info[0, 3] + 
scale_h = im_info[0, 0] / im_info[0, 2] + pred_boxes = pred_boxes / F.concat( + [scale_w, scale_h, scale_w, scale_h], axis=0 + ) + + clipped_boxes = layers.get_clipped_box( + pred_boxes, im_info[0, 2:4] + ).reshape(-1, self.cfg.num_classes, 4) + return pred_score, clipped_boxes + + +class FasterRCNNConfig: + + def __init__(self): + self.resnet_norm = "FrozenBN" + self.backbone_freeze_at = 2 + + # ------------------------ data cfg --------------------------- # + self.train_dataset = dict( + name="coco", + root="train2017", + ann_file="annotations/instances_train2017.json", + ) + self.test_dataset = dict( + name="coco", + root="val2017", + ann_file="annotations/instances_val2017.json", + ) + self.num_classes = 80 + + self.img_mean = np.array([103.530, 116.280, 123.675]) # BGR + self.img_std = np.array([57.375, 57.120, 58.395]) + + # ----------------------- rpn cfg ------------------------- # + self.anchor_base_size = 16 + self.anchor_scales = np.array([0.5]) + self.anchor_aspect_ratios = [0.5, 1, 2] + self.anchor_offset = -0.5 + self.num_cell_anchors = len(self.anchor_aspect_ratios) + + self.bbox_normalize_means = None + self.bbox_normalize_stds = np.array([0.1, 0.1, 0.2, 0.2]) + + self.rpn_stride = np.array([4, 8, 16, 32, 64]).astype(np.float32) + self.rpn_in_features = ["p2", "p3", "p4", "p5", "p6"] + self.rpn_channel = 256 + + self.rpn_nms_threshold = 0.7 + self.allow_low_quality = True + self.num_sample_anchors = 256 + self.positive_anchor_ratio = 0.5 + self.rpn_positive_overlap = 0.7 + self.rpn_negative_overlap = 0.3 + self.ignore_label = -1 + + # ----------------------- rcnn cfg ------------------------- # + self.pooling_method = 'roi_align' + self.pooling_size = (7, 7) + + self.num_rois = 512 + self.fg_ratio = 0.5 + self.fg_threshold = 0.5 + self.bg_threshold_high = 0.5 + self.bg_threshold_low = 0.0 + + self.rcnn_in_features = ["p2", "p3", "p4", "p5"] + self.rcnn_stride = [4, 8, 16, 32] + + # ------------------------ loss cfg -------------------------- # + self.rpn_smooth_l1_beta = 3 + self.rcnn_smooth_l1_beta = 1 + + # ------------------------ training cfg ---------------------- # + self.train_image_short_size = 800 + self.train_image_max_size = 1333 + self.train_prev_nms_top_n = 2000 + self.train_post_nms_top_n = 1000 + + self.num_losses = 5 + self.basic_lr = 0.02 / 16.0 # The basic learning rate for single-image + self.momentum = 0.9 + self.weight_decay = 1e-4 + self.log_interval = 20 + self.nr_images_epoch = 80000 + self.max_epoch = 18 + self.warm_iters = 500 + self.lr_decay_rate = 0.1 + self.lr_decay_sates = [12, 16, 17] + + # ------------------------ testing cfg ------------------------- # + self.test_image_short_size = 800 + self.test_image_max_size = 1333 + self.test_prev_nms_top_n = 1000 + self.test_post_nms_top_n = 1000 + self.test_max_boxes_per_image = 100 + + self.test_vis_threshold = 0.3 + self.test_cls_threshold = 0.05 + self.test_nms = 0.5 + self.class_aware_box = True diff --git a/official/vision/detection/models/retinanet.py b/official/vision/detection/models/retinanet.py index 1013819788d75d75b9500dcfa7eeb85680526dde..eb5f07e3ac84f4d6ba563f94a578265717e51a46 100644 --- a/official/vision/detection/models/retinanet.py +++ b/official/vision/detection/models/retinanet.py @@ -123,7 +123,13 @@ class RetinaNet(M.Module): ) total = rpn_cls_loss + rpn_bbox_loss - return total, rpn_cls_loss, rpn_bbox_loss + loss_dict = { + "total_loss": total, + "loss_cls": rpn_cls_loss, + "loss_loc": rpn_bbox_loss + } + self.cfg.losses_keys = list(loss_dict.keys()) + return loss_dict else: # 
currently not support multi-batch testing assert self.batch_size == 1 @@ -231,6 +237,7 @@ class RetinaNetConfig: self.focal_loss_alpha = 0.25 self.focal_loss_gamma = 2 self.reg_loss_weight = 1.0 / 4.0 + self.num_losses = 3 # ------------------------ training cfg ---------------------- # self.basic_lr = 0.01 / 16.0 # The basic learning rate for single-image diff --git a/official/vision/detection/retinanet_res50_coco_1x_800size.py b/official/vision/detection/retinanet_res50_coco_1x_800size.py index 8324290dd5cb48678fd2583980ddc4df88dd6656..9c06d55d280a76565cb630d6e40cb3454f5022bb 100644 --- a/official/vision/detection/retinanet_res50_coco_1x_800size.py +++ b/official/vision/detection/retinanet_res50_coco_1x_800size.py @@ -19,6 +19,8 @@ def retinanet_res50_coco_1x_800size(batch_size=1, **kwargs): r""" RetinaNet trained from COCO dataset. `"RetinaNet" `_ + `"FPN" `_ + `"COCO" `_ """ return models.RetinaNet(models.RetinaNetConfig(), batch_size=batch_size, **kwargs) diff --git a/official/vision/detection/retinanet_res50_coco_1x_800size_syncbn.py b/official/vision/detection/retinanet_res50_coco_1x_800size_syncbn.py index 363a542ed2268a83eeabca889167e4be8730bb17..594439506628ffa3e02ec3969e52885657be15f8 100644 --- a/official/vision/detection/retinanet_res50_coco_1x_800size_syncbn.py +++ b/official/vision/detection/retinanet_res50_coco_1x_800size_syncbn.py @@ -6,7 +6,6 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -from megengine import hub from official.vision.detection import models @@ -24,6 +23,9 @@ def retinanet_res50_coco_1x_800size_syncbn(batch_size=1, **kwargs): r""" RetinaNet with SyncBN trained from COCO dataset. `"RetinaNet" `_ + `"FPN" `_ + `"COCO" `_ + `"SyncBN" `_ """ return models.RetinaNet(CustomRetinaNetConfig(), batch_size=batch_size, **kwargs) diff --git a/official/vision/detection/retinanet_res50_objects365_1x_800size.py b/official/vision/detection/retinanet_res50_objects365_1x_800size.py index 951d09a9cecc44ae9122ec12ee50511cc1c25d8b..2b5397856dfcee1315dc20eba9f268730088e9ca 100644 --- a/official/vision/detection/retinanet_res50_objects365_1x_800size.py +++ b/official/vision/detection/retinanet_res50_objects365_1x_800size.py @@ -6,8 +6,6 @@ # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-from megengine import hub - from official.vision.detection import models diff --git a/official/vision/detection/tools/gpu_nms.py b/official/vision/detection/tools/gpu_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..cf8422917565721b8a89114e2252bd27744b2c0e --- /dev/null +++ b/official/vision/detection/tools/gpu_nms.py @@ -0,0 +1,98 @@ +#!/usr/bin/env mdl +# This file will seal the nms opr within a better way than lib_nms +import ctypes +import os +import struct + +import numpy as np +import megengine as mge +import megengine.functional as F +from megengine._internal.craniotome import CraniotomeBase +from megengine.core.tensor import wrap_io_tensor + +_so_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lib_nms.so') +_so_lib = ctypes.CDLL(_so_path) + +_TYPE_POINTER = ctypes.c_void_p +_TYPE_POINTER = ctypes.c_void_p +_TYPE_INT = ctypes.c_int32 +_TYPE_FLOAT = ctypes.c_float + +_so_lib.NMSForwardGpu.argtypes = [ + _TYPE_POINTER, + _TYPE_POINTER, + _TYPE_POINTER, + _TYPE_POINTER, + _TYPE_FLOAT, + _TYPE_INT, + _TYPE_POINTER, +] +_so_lib.NMSForwardGpu.restype = _TYPE_INT + +_so_lib.CreateHostDevice.restype = _TYPE_POINTER + + +class NMSCran(CraniotomeBase): + __nr_inputs__ = 1 + __nr_outputs__ = 3 + + def setup(self, iou_threshold, max_output): + self._iou_threshold = iou_threshold + self._max_output = max_output + # Load the necessary host device + self._host_device = _so_lib.CreateHostDevice() + + def execute(self, inputs, outputs): + box_tensor_ptr = inputs[0].pubapi_dev_tensor_ptr + output_tensor_ptr = outputs[0].pubapi_dev_tensor_ptr + output_num_tensor_ptr = outputs[1].pubapi_dev_tensor_ptr + mask_tensor_ptr = outputs[2].pubapi_dev_tensor_ptr + + _so_lib.NMSForwardGpu( + box_tensor_ptr, mask_tensor_ptr, + output_tensor_ptr, output_num_tensor_ptr, + self._iou_threshold, self._max_output, + self._host_device + ) + + def grad(self, wrt_idx, inputs, outputs, out_grad): + return 0 + + def init_output_dtype(self, input_dtypes): + return [np.int32, np.int32, np.int32] + + def get_serialize_params(self): + return ('nms', struct.pack('fi', self._iou_threshold, self._max_output)) + + def infer_shape(self, inp_shapes): + nr_box = inp_shapes[0][0] + threadsPerBlock = 64 + output_size = nr_box + # here we compute the number of int32 used in mask_outputs. + # In original version, we compute the bytes only. 
+ mask_size = int( + nr_box * ( + nr_box // threadsPerBlock + int((nr_box % threadsPerBlock) > 0) + ) * 8 / 4 + ) + return [[output_size], [1], [mask_size]] + + +@wrap_io_tensor +def gpu_nms(box, iou_threshold, max_output): + keep, num, _ = NMSCran.make(box, iou_threshold=iou_threshold, max_output=max_output) + return keep[:num] + + +def batched_nms(boxes, scores, idxs, iou_threshold, num_keep, use_offset=False): + if use_offset: + boxes_offset = mge.tensor( + [0, 0, 1, 1], device=boxes.device + ).reshape(1, 4).broadcast(boxes.shapeof(0), 4) + boxes = boxes - boxes_offset + max_coordinate = boxes.max() + offsets = idxs * (max_coordinate + 1) + boxes_for_nms = boxes + offsets.reshape(-1, 1).broadcast(boxes.shapeof(0), 4) + boxes_with_scores = F.concat([boxes_for_nms, scores.reshape(-1, 1)], axis=1) + keep_inds = gpu_nms(boxes_with_scores, iou_threshold, num_keep) + return keep_inds diff --git a/official/vision/detection/tools/gpu_nms/nms.cu b/official/vision/detection/tools/gpu_nms/nms.cu new file mode 100644 index 0000000000000000000000000000000000000000..c89e2c8835c8c605cd1da83ce93ee2bf9fb43138 --- /dev/null +++ b/official/vision/detection/tools/gpu_nms/nms.cu @@ -0,0 +1,201 @@ +#include "megbrain_pubapi.h" +#include +#include +#include + +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) +#define CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + std::cout << " " << cudaGetErrorString(error); \ + } \ + } while (0) +#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) + +int const threadsPerBlock = sizeof(unsigned long long) * 8; // 64 + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > 
nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +template +static inline void cpu_unroll_for(unsigned long long *dst, const unsigned long long *src, int n) { + int nr_out = (n - n % unroll) / unroll; + for (int i = 0; i < nr_out; ++i) { +#pragma unroll + for (int j = 0; j < unroll; ++j) { + *(dst++) |= *(src++); + } + } + for (int j = 0; j < n % unroll; ++j) { + *(dst++) |= *(src++); + } +} + +using std::vector; +// const int nr_init_box = 8000; +// vector _mask_host(nr_init_box * (nr_init_box / threadsPerBlock)); +// vector _remv(nr_init_box / threadsPerBlock); +// vector _keep_out(nr_init_box); + +// NOTE: If we directly use this lib in nmp.py, we will meet the same _mask_host and other +// objects, which is not safe for multi-processing programs. + +class HostDevice{ +protected: + static const int nr_init_box = 8000; +public: + vector mask_host; + vector remv; + vector keep_out; + + HostDevice(): mask_host(nr_init_box * (nr_init_box / threadsPerBlock)), remv(nr_init_box / threadsPerBlock), keep_out(nr_init_box){} +}; + +extern "C"{ + using MGBDevTensor = mgb::pubapi::DeviceTensor; + using std::cout; + + void * CreateHostDevice(){ + return new HostDevice(); + } + + int NMSForwardGpu(void* box_ptr, void* mask_ptr, void* output_ptr, void* output_num_ptr, float iou_threshold, int max_output, void* host_device_ptr){ + auto box_tensor = mgb::pubapi::as_versioned_obj(box_ptr); + auto mask_tensor= mgb::pubapi::as_versioned_obj(mask_ptr); + auto output_tensor = mgb::pubapi::as_versioned_obj(output_ptr); + auto output_num_tensor = mgb::pubapi::as_versioned_obj(output_num_ptr); + + // auto cuda_stream = static_cast (box_tensor->desc.cuda_ctx.stream); + auto cuda_stream = static_cast (output_tensor->desc.cuda_ctx.stream); + // assert(box_tensor->desc.shape[0] == output_tensor->desc.shape[0]); + + // cout << "box_tensor.ndim: " << box_tensor->desc.ndim << "\n"; + // cout << "box_tensor.shape_0: " << box_tensor->desc.shape[0] << "\n"; + // cout << "box_tensor.shape_1: " << box_tensor->desc.shape[1] << "\n"; + int box_num = box_tensor->desc.shape[0]; + int box_dim = box_tensor->desc.shape[1]; + assert(box_dim == 5); + + const int col_blocks = DIVUP(box_num, threadsPerBlock); + // cout << "mask_dev size: " << box_num * col_blocks * sizeof(unsigned long long) << "\n"; + // cout << "mask_ptr size: " << mask_tensor->desc.shape[0] * sizeof(int) << "\n"; + // cout << "mask shape : " << mask_tensor->desc.shape[0] << "\n"; + + dim3 blocks(DIVUP(box_num, threadsPerBlock), DIVUP(box_num, threadsPerBlock)); + // dim3 blocks(col_blocks, col_blocks); + dim3 threads(threadsPerBlock); + // cout << "sizeof unsigned long long " << sizeof(unsigned long long) << "\n"; + float* dev_box = static_cast (box_tensor->desc.dev_ptr); + unsigned long long* dev_mask = static_cast (mask_tensor->desc.dev_ptr); + int * dev_output = static_cast (output_tensor->desc.dev_ptr); + + CUDA_CHECK(cudaMemsetAsync(dev_mask, 0, mask_tensor->desc.shape[0] * sizeof(int), cuda_stream)); + // CUDA_CHECK(cudaMemsetAsync(dev_output, 0, output_tensor->desc.shape[0] * sizeof(int), cuda_stream)); + nms_kernel<<>>(box_num, iou_threshold, dev_box, dev_mask); + // cudaDeviceSynchronize(); + + // get the host device vectors + HostDevice* host_device = static_cast(host_device_ptr); + vector& _mask_host = host_device->mask_host; + vector& _remv = host_device->remv; + vector& _keep_out = host_device->keep_out; + + + int 
current_mask_host_size = box_num * col_blocks; + if(_mask_host.capacity() < current_mask_host_size){ + _mask_host.reserve(current_mask_host_size); + } + CUDA_CHECK(cudaMemcpyAsync(&_mask_host[0], dev_mask, sizeof(unsigned long long) * box_num * col_blocks, cudaMemcpyDeviceToHost, cuda_stream)); + // cout << "\n m_host site: " << static_cast (&_mask_host[0]) << "\n"; + + if(_remv.capacity() < col_blocks){ + _remv.reserve(col_blocks); + } + if(_keep_out.capacity() < box_num){ + _keep_out.reserve(box_num); + } + if(max_output < 0){ + max_output = box_num; + } + memset(&_remv[0], 0, sizeof(unsigned long long) * col_blocks); + CUDA_CHECK(cudaStreamSynchronize(cuda_stream)); + + // do the cpu reduce + int num_to_keep = 0; + for (int i = 0; i < box_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(_remv[nblock] & (1ULL << inblock))) { + _keep_out[num_to_keep++] = i; + if(num_to_keep == max_output){ + break; + } + // NOTE: here we need add nblock to pointer p + unsigned long long *p = &_mask_host[0] + i * col_blocks + nblock; + unsigned long long *q = &_remv[0] + nblock; + cpu_unroll_for(q, p, col_blocks - nblock); + } + } + CUDA_CHECK(cudaMemcpyAsync(dev_output, &_keep_out[0], num_to_keep * sizeof(int), cudaMemcpyHostToDevice, cuda_stream)); + int* dev_output_num = static_cast(output_num_tensor->desc.dev_ptr); + CUDA_CHECK(cudaMemcpyAsync(dev_output_num, &num_to_keep, sizeof(int), cudaMemcpyHostToDevice, cuda_stream)); + // CUDA_CHECK(cudaStreamSynchronize(cuda_stream)); + return num_to_keep; + } +} diff --git a/official/vision/detection/tools/train.py b/official/vision/detection/tools/train.py index f93093d4ddcb4a4db71b4fc26b9542271050e7f3..a65bdbc5e2f8211ab59934e6c7eff55859908b54 100644 --- a/official/vision/detection/tools/train.py +++ b/official/vision/detection/tools/train.py @@ -128,11 +128,12 @@ def adjust_learning_rate(optimizer, epoch_id, step, model, world_size): def train_one_epoch(model, data_queue, opt, tot_steps, rank, epoch_id, world_size): @jit.trace(symbolic=True, opt_level=2) def propagate(): - loss_list = model(model.inputs) - opt.backward(loss_list[0]) - return loss_list + loss_dict = model(model.inputs) + opt.backward(loss_dict["total_loss"]) + losses = list(loss_dict.values()) + return losses - meter = AverageMeter(record_len=3) + meter = AverageMeter(record_len=model.cfg.num_losses) log_interval = model.cfg.log_interval for step in range(tot_steps): adjust_learning_rate(opt, epoch_id, step, model, world_size) @@ -146,17 +147,18 @@ def train_one_epoch(model, data_queue, opt, tot_steps, rank, epoch_id, world_siz opt.step() if rank == 0: + loss_str = ", ".join(["{}:%f".format(loss) for loss in model.cfg.losses_keys]) + log_info_str = "e%d, %d/%d, lr:%f, " + loss_str meter.update([loss.numpy() for loss in loss_list]) if step % log_interval == 0: average_loss = meter.average() logger.info( - "e%d, %d/%d, lr:%f, cls:%f, loc:%f", + log_info_str, epoch_id, step, tot_steps, opt.param_groups[0]["lr"], - average_loss[1], - average_loss[2], + *average_loss, ) meter.reset()
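
Usage sketch for the new Faster R-CNN entry point exposed through `hubconf.py`: a minimal example of building the detector for single-image inference, assuming the `@hub.pretrained` decorator accepts `pretrained=True` the same way the existing RetinaNet entries do (exact keyword support may vary with the MegEngine version).

```python
from official.vision.detection.faster_rcnn_fpn_res50_coco_1x_800size import (
    faster_rcnn_fpn_res50_coco_1x_800size,
)

# batch_size=1 matches the single-image inference path used by tools/inference.py;
# pretrained=True (assumed) fetches the weights registered via @hub.pretrained.
model = faster_rcnn_fpn_res50_coco_1x_800size(batch_size=1, pretrained=True)
model.eval()
```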