diff --git a/configs/_base_/models/mask_rcnn_r50.yml b/configs/_base_/models/mask_rcnn_r50.yml new file mode 100644 index 0000000000000000000000000000000000000000..7013ecfdc2caba559dfa158a511757e8e22d203c --- /dev/null +++ b/configs/_base_/models/mask_rcnn_r50.yml @@ -0,0 +1,114 @@ +architecture: MaskRCNN +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/mask_rcnn_r50_fpn_1x/model_final +load_static_weights: True + +# Model Architecture +MaskRCNN: + # model anchor info flow + anchor: Anchor + proposal: Proposal + mask: Mask + # model feat info flow + backbone: ResNet + rpn_head: RPNHead + bbox_head: BBoxHead + mask_head: MaskHead + # post process + bbox_post_process: BBoxPostProcess + mask_post_process: MaskPostProcess + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [2] + num_stages: 3 + +RPNHead: + rpn_feat: + name: RPNFeat + feat_in: 1024 + feat_out: 1024 + anchor_per_position: 15 + +Anchor: + anchor_generator: + name: AnchorGeneratorRPN + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_target_generator: + name: AnchorTargetGeneratorRPN + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + straddle_thresh: 0.0 + +Proposal: + proposal_generator: + name: ProposalGenerator + min_size: 0.0 + nms_thresh: 0.7 + train_pre_nms_top_n: 12000 + train_post_nms_top_n: 2000 + infer_pre_nms_top_n: 6000 + infer_post_nms_top_n: 1000 + proposal_target_generator: + name: ProposalTargetGenerator + batch_size_per_im: 512 + bbox_reg_weights: [[0.1, 0.1, 0.2, 0.2],] + bg_thresh_hi: [0.5,] + bg_thresh_lo: [0.0,] + fg_thresh: [0.5,] + fg_fraction: 0.25 + +BBoxHead: + bbox_feat: + name: BBoxFeat + roi_extractor: RoIAlign + head_feat: + name: Res5Head + feat_in: 1024 + feat_out: 512 + with_pool: true + in_feat: 2048 + +BBoxPostProcess: + decode: + name: 
RCNNBox + num_classes: 81 + batch_size: 1 + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 + +Mask: + mask_target_generator: + name: MaskTargetGenerator + mask_resolution: 14 + +RoIAlign: + resolution: 14 + sampling_ratio: 0 + start_level: 0 + end_level: 0 + +MaskHead: + mask_feat: + name: MaskFeat + num_convs: 0 + feat_in: 2048 + feat_out: 256 + mask_roi_extractor: RoIAlign + share_bbox_feat: true + feat_in: 256 + + +MaskPostProcess: + mask_resolution: 14 diff --git a/configs/_base_/readers/mask_fpn_reader.yml b/configs/_base_/readers/mask_fpn_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..ba34948f18d549bb8cbb9214b940f60ae86c324e --- /dev/null +++ b/configs/_base_/readers/mask_fpn_reader.yml @@ -0,0 +1,46 @@ +worker_num: 2 +TrainReader: + inputs_def: + fields: ['image', 'im_info', 'im_id', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] + sample_transforms: + - DecodeImage: {to_rgb: true} + - RandomFlipImage: {prob: 0.5, is_mask_flip: true} + - NormalizeImage: {is_channel_first: false, is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - ResizeImage: {target_size: 800, max_size: 1333, interp: 1, use_cv2: true} + - Permute: {to_bgr: false, channel_first: true} + batch_transforms: + - PadBatch: {pad_to_stride: 32, use_padded_im_info: false, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + + +EvalReader: + inputs_def: + fields: ['image', 'im_shape', 'scale_factor', 'im_id'] + sample_transforms: + - DecodeOp: {} + - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - ResizeOp: {interp: 1, target_size: [800, 1333]} + - PermuteOp: {} + batch_transforms: + - PadBatchOp: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + inputs_def: + fields: ['image', 'im_shape', 'scale_factor', 'im_id'] + sample_transforms: + - DecodeOp: {} + - NormalizeImageOp: 
{is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - ResizeOp: {interp: 1, target_size: [800, 1333]} + - PermuteOp: {} + batch_transforms: + - PadBatchOp: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/_base_/readers/mask_reader.yml b/configs/_base_/readers/mask_reader.yml index c7296653e7a64da4bb3bb7e82e151c253ff1e1e7..0c8bbd4b5934b03e1af55d54ada61762893bcfc0 100644 --- a/configs/_base_/readers/mask_reader.yml +++ b/configs/_base_/readers/mask_reader.yml @@ -9,7 +9,7 @@ TrainReader: - ResizeImage: {target_size: 800, max_size: 1333, interp: 1, use_cv2: true} - Permute: {to_bgr: false, channel_first: true} batch_transforms: - - PadBatch: {pad_to_stride: 32, use_padded_im_info: false, pad_gt: true} + - PadBatch: {pad_to_stride: -1., use_padded_im_info: false, pad_gt: true} batch_size: 1 shuffle: true drop_last: true @@ -24,7 +24,7 @@ EvalReader: - ResizeOp: {interp: 1, target_size: [800, 1333]} - PermuteOp: {} batch_transforms: - - PadBatchOp: {pad_to_stride: 32, pad_gt: false} + - PadBatchOp: {pad_to_stride: -1., pad_gt: false} batch_size: 1 shuffle: false drop_last: false @@ -33,14 +33,15 @@ EvalReader: TestReader: inputs_def: - fields: ['image', 'im_info', 'im_id'] + fields: ['image', 'im_shape', 'scale_factor', 'im_id'] sample_transforms: - - DecodeImage: {to_rgb: true, with_mixup: false} - - NormalizeImage: {is_channel_first: false, is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} - - ResizeImage: {interp: 1, max_size: 1333, target_size: 800, use_cv2: true} - - Permute: {channel_first: true, to_bgr: false} + - DecodeOp: {} + - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - ResizeOp: {interp: 1, target_size: [800, 1333]} + - PermuteOp: {} batch_transforms: - - PadBatch: {pad_to_stride: 32, use_padded_im_info: false, pad_gt: false} + - PadBatchOp: {pad_to_stride: -1., pad_gt: false} batch_size: 1 shuffle: false 
drop_last: false + drop_empty: false diff --git a/configs/mask_rcnn_r50_1x_coco.yml b/configs/mask_rcnn_r50_1x_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..4d7eaaae9804d4e573d4cb718978b149b45e1f00 --- /dev/null +++ b/configs/mask_rcnn_r50_1x_coco.yml @@ -0,0 +1,7 @@ +_BASE_: [ + './_base_/models/mask_rcnn_r50.yml', + './_base_/optimizers/rcnn_1x.yml', + './_base_/datasets/coco.yml', + './_base_/readers/mask_reader.yml', + './_base_/runtime.yml', +] diff --git a/configs/mask_rcnn_r50_fpn_1x_coco.yml b/configs/mask_rcnn_r50_fpn_1x_coco.yml index c99ec39932145c81f9e58720a18ef5351e307801..3332b43f30515f20792cd91f5992f3384aeafb94 100644 --- a/configs/mask_rcnn_r50_fpn_1x_coco.yml +++ b/configs/mask_rcnn_r50_fpn_1x_coco.yml @@ -2,6 +2,6 @@ _BASE_: [ './_base_/models/mask_rcnn_r50_fpn.yml', './_base_/optimizers/rcnn_1x.yml', './_base_/datasets/coco.yml', - './_base_/readers/mask_reader.yml', + './_base_/readers/mask_fpn_reader.yml', './_base_/runtime.yml', ] diff --git a/ppdet/modeling/architecture/mask_rcnn.py b/ppdet/modeling/architecture/mask_rcnn.py index 76b17998d2a6f2a6f2e1ef3e22f7a681170845c3..34343a7c096a749d3715e132e37b3e70c0d8ff91 100644 --- a/ppdet/modeling/architecture/mask_rcnn.py +++ b/ppdet/modeling/architecture/mask_rcnn.py @@ -65,7 +65,7 @@ class MaskRCNN(BaseArch): def model_arch(self): # Backbone body_feats = self.backbone(self.inputs) - spatial_scale = None + spatial_scale = 1. 
/ 16 # Neck if self.neck is not None: @@ -87,8 +87,8 @@ class MaskRCNN(BaseArch): # compute targets here when training rois = self.proposal(self.inputs, self.rpn_head_out, self.anchor_out) # BBox Head - bbox_feat, self.bbox_head_out = self.bbox_head(body_feats, rois, - spatial_scale) + bbox_feat, self.bbox_head_out, self.bbox_head_feat_func = self.bbox_head( + body_feats, rois, spatial_scale) rois_has_mask_int32 = None if self.inputs['mode'] == 'infer': @@ -106,9 +106,9 @@ class MaskRCNN(BaseArch): bbox_targets) # Mask Head - self.mask_head_out = self.mask_head(self.inputs, body_feats, - self.bboxes, bbox_feat, - rois_has_mask_int32, spatial_scale) + self.mask_head_out = self.mask_head( + self.inputs, body_feats, self.bboxes, bbox_feat, + rois_has_mask_int32, spatial_scale, self.bbox_head_feat_func) def get_loss(self, ): loss = {} diff --git a/ppdet/modeling/head/bbox_head.py b/ppdet/modeling/head/bbox_head.py index 6b5b8013525084a60ac5b99ad4800c7ff0018e41..dbccd4c58b9f371df0f5187c91b66d5b1e6d38f4 100644 --- a/ppdet/modeling/head/bbox_head.py +++ b/ppdet/modeling/head/bbox_head.py @@ -22,6 +22,9 @@ from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ppdet.modeling import ops +from ..backbone.name_adapter import NameAdapter +from ..backbone.resnet import Blocks + @register class TwoFCHead(nn.Layer): @@ -74,6 +77,23 @@ class TwoFCHead(nn.Layer): return fc7_relu +@register +class Res5Head(nn.Layer): + def __init__(self, feat_in=1024, feat_out=512): + super(Res5Head, self).__init__() + na = NameAdapter(self) + self.res5_conv = [] + self.res5 = self.add_sublayer( + 'res5_roi_feat', + Blocks( + feat_in, feat_out, count=3, name_adapter=na, stage_num=5)) + self.feat_out = feat_out * 4 + + def forward(self, roi_feat, stage=0): + y = self.res5(roi_feat) + return y + + @register class BBoxFeat(nn.Layer): __inject__ = ['roi_extractor', 'head_feat'] @@ -86,7 +106,7 @@ class BBoxFeat(nn.Layer): def forward(self, body_feats, rois, spatial_scale, 
stage=0): rois_feat = self.roi_extractor(body_feats, rois, spatial_scale) bbox_feat = self.head_feat(rois_feat, stage) - return bbox_feat + return bbox_feat, self.head_feat @register @@ -139,15 +159,19 @@ class BBoxHead(nn.Layer): self.bbox_delta_list.append(bbox_delta) def forward(self, body_feats, rois, spatial_scale, stage=0): - bbox_feat = self.bbox_feat(body_feats, rois, spatial_scale, stage) - if self.with_pool: - bbox_feat = F.pool2d( - bbox_feat, pool_type='avg', global_pooling=True) + bbox_feat, head_feat_func = self.bbox_feat(body_feats, rois, + spatial_scale, stage) bbox_head_out = [] - scores = self.bbox_score_list[stage](bbox_feat) - deltas = self.bbox_delta_list[stage](bbox_feat) + if self.with_pool: + bbox_feat_ = F.adaptive_avg_pool2d(bbox_feat, output_size=1) + bbox_feat_ = paddle.squeeze(bbox_feat_, axis=[2, 3]) + scores = self.bbox_score_list[stage](bbox_feat_) + deltas = self.bbox_delta_list[stage](bbox_feat_) + else: + scores = self.bbox_score_list[stage](bbox_feat) + deltas = self.bbox_delta_list[stage](bbox_feat) bbox_head_out.append((scores, deltas)) - return bbox_feat, bbox_head_out + return bbox_feat, bbox_head_out, head_feat_func def _get_head_loss(self, score, delta, target): # bbox cls diff --git a/ppdet/modeling/head/mask_head.py b/ppdet/modeling/head/mask_head.py index 7db51eb186d80a8fa8c30857f926f9b2c150dc96..f57a98de0a202845ff2edec4821ef6e19b743d81 100644 --- a/ppdet/modeling/head/mask_head.py +++ b/ppdet/modeling/head/mask_head.py @@ -28,8 +28,8 @@ class MaskFeat(Layer): __inject__ = ['mask_roi_extractor'] def __init__(self, - mask_roi_extractor, - num_convs=1, + mask_roi_extractor=None, + num_convs=0, feat_in=2048, feat_out=256, mask_num_stages=1, @@ -82,12 +82,16 @@ class MaskFeat(Layer): bbox_feat, mask_index, spatial_scale, - stage=0): - if self.share_bbox_feat: + stage=0, + bbox_head_feat_func=None, + mode='train'): + if self.share_bbox_feat and mask_index is not None: rois_feat = paddle.gather(bbox_feat, mask_index) else: rois_feat = 
self.mask_roi_extractor(body_feats, bboxes, spatial_scale) + if bbox_head_feat_func is not None and mode == 'infer': + rois_feat = bbox_head_feat_func(rois_feat) # upsample mask_feat = self.upsample_module[stage](rois_feat) return mask_feat @@ -131,8 +135,14 @@ class MaskHead(Layer): spatial_scale, stage=0): # feat - mask_feat = self.mask_feat(body_feats, bboxes, bbox_feat, mask_index, - spatial_scale, stage) + mask_feat = self.mask_feat( + body_feats, + bboxes, + bbox_feat, + mask_index, + spatial_scale, + stage, + mode='train') # logits mask_head_out = self.mask_fcn_logits[stage](mask_feat) return mask_head_out @@ -144,7 +154,8 @@ class MaskHead(Layer): bbox_feat, mask_index, spatial_scale, - stage=0): + stage=0, + bbox_head_feat_func=None): bbox, bbox_num = bboxes if bbox.shape[0] == 0: mask_head_out = bbox @@ -155,11 +166,18 @@ class MaskHead(Layer): scale_factor_list.append(scale_factor[idx, 0]) scale_factor_list = paddle.cast( paddle.concat(scale_factor_list), 'float32') - scaled_bbox = paddle.multiply( - bbox[:, 2:], scale_factor_list, axis=0) + scale_factor_list = paddle.reshape(scale_factor_list, shape=[-1, 1]) + scaled_bbox = paddle.multiply(bbox[:, 2:], scale_factor_list) scaled_bboxes = (scaled_bbox, bbox_num) - mask_feat = self.mask_feat(body_feats, scaled_bboxes, bbox_feat, - mask_index, spatial_scale, stage) + mask_feat = self.mask_feat( + body_feats, + scaled_bboxes, + bbox_feat, + mask_index, + spatial_scale, + stage, + bbox_head_feat_func, + mode='infer') mask_logit = self.mask_fcn_logits[stage](mask_feat) mask_head_out = F.sigmoid(mask_logit) return mask_head_out @@ -171,15 +189,16 @@ class MaskHead(Layer): bbox_feat, mask_index, spatial_scale, + bbox_head_feat_func=None, stage=0): if inputs['mode'] == 'train': mask_head_out = self.forward_train(body_feats, bboxes, bbox_feat, mask_index, spatial_scale, stage) else: scale_factor = inputs['scale_factor'] - mask_head_out = self.forward_test(scale_factor, body_feats, bboxes, - bbox_feat, mask_index, 
- spatial_scale, stage) + mask_head_out = self.forward_test( + scale_factor, body_feats, bboxes, bbox_feat, mask_index, + spatial_scale, stage, bbox_head_feat_func) return mask_head_out def get_loss(self, mask_head_out, mask_target): diff --git a/ppdet/modeling/layers.py b/ppdet/modeling/layers.py index 3fe9d1d7e40feee21ea4cdfd15ffceee2aa2cb32..ac52db5b7f30d678d970b7685a07669ee76dccb2 100644 --- a/ppdet/modeling/layers.py +++ b/ppdet/modeling/layers.py @@ -295,21 +295,13 @@ class RCNNBox(object): box_normalized=self.box_normalized, axis=self.axis) # TODO: Updata box_clip - origin_h = origin_shape[:, 0] - 1 - origin_w = origin_shape[:, 1] - 1 + origin_h = paddle.unsqueeze(origin_shape[:, 0] - 1, axis=1) + origin_w = paddle.unsqueeze(origin_shape[:, 1] - 1, axis=1) zeros = paddle.zeros(origin_h.shape, 'float32') - x1 = paddle.maximum( - paddle.minimum( - bbox[:, :, 0], origin_w, axis=0), zeros, axis=0) - y1 = paddle.maximum( - paddle.minimum( - bbox[:, :, 1], origin_h, axis=0), zeros, axis=0) - x2 = paddle.maximum( - paddle.minimum( - bbox[:, :, 2], origin_w, axis=0), zeros, axis=0) - y2 = paddle.maximum( - paddle.minimum( - bbox[:, :, 3], origin_h, axis=0), zeros, axis=0) + x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros) + y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros) + x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros) + y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros) bbox = paddle.stack([x1, y1, x2, y2], axis=-1) bboxes = (bbox, rois_num)