Unverified commit a23f89d0, authored by greatlog, committed by GitHub

feat(keypoints) add MSPN (#33)

Parent 514908dd
......@@ -88,6 +88,16 @@ export PYTHONPATH=/path/to/models:$PYTHONPATH
| :--: |:--: |:--: |:--: |
| Deeplabv3plus | Resnet101 | 79.0 | 79.8 |
### Human Keypoint Detection
We provide the classic human keypoint detection model [SimpleBaseline](https://arxiv.org/pdf/1804.06208.pdf) and the high-accuracy model [MSPN](https://arxiv.org/pdf/1901.00148.pdf). Using human detection results with an AP of 56 on COCO val2017, the provided models achieve the following keypoint detection results on COCO val2017:
|Methods|Backbone|Input Size| AP | AP .5 | AP .75 | AP (M) | AP (L) | AR | AR .5 | AR .75 | AR (M) | AR (L) |
|---|:---:|---|---|---|---|---|---|---|---|---|---|---|
| SimpleBaseline |Res50 |256x192| 0.712 | 0.887 | 0.779 | 0.673 | 0.785 | 0.782 | 0.932 | 0.839 | 0.730 | 0.854 |
| SimpleBaseline |Res101|256x192| 0.722 | 0.891 | 0.795 | 0.687 | 0.795 | 0.794 | 0.936 | 0.855 | 0.745 | 0.863 |
| SimpleBaseline |Res152|256x192| 0.724 | 0.888 | 0.794 | 0.688 | 0.795 | 0.795 | 0.934 | 0.856 | 0.746 | 0.863 |
| MSPN_4stage |MSPN|256x192| 0.752 | 0.900 | 0.819 | 0.716 | 0.825 | 0.819 | 0.943 | 0.875 | 0.770 | 0.887 |
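A minimal sketch of loading one of these models from Python (the `pretrained=True` behavior of the `megengine.hub.pretrained` decorator for fetching the released weights is an assumption here):
```python
import numpy as np
from official.vision.keypoints.models import mspn_4stage

# build the 4-stage MSPN; pretrained=True is assumed to pull the released weights
model = mspn_4stage(pretrained=True)
model.eval()

# one normalized 256x192 RGB crop in NCHW layout
image = np.random.random((1, 3, 256, 192)).astype(np.float32)
model.inputs["image"].set_value(image)
pred = model.predict()  # (1, 17, 64, 48): one heatmap per COCO keypoint
print(pred.shape)
```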
### Natural Language Processing
......
......@@ -43,4 +43,7 @@ from official.vision.keypoints.models import (
simplebaseline_res50,
simplebaseline_res101,
simplebaseline_res152,
mspn_4stage
)
from official.vision.keypoints.inference import KeypointEvaluator
......@@ -5,10 +5,10 @@
This directory uses human detection results with a human AP of 56.4 on COCO val2017; the final human keypoint estimation results on COCO val2017 are:
|Methods|Backbone|Input Size| AP | AP .5 | AP .75 | AP (M) | AP (L) | AR | AR .5 | AR .75 | AR (M) | AR (L) |
|---|:---:|---|---|---|---|---|---|---|---|---|---|---|
| SimpleBaseline |Res50 |256x192| 0.712 | 0.887 | 0.779 | 0.673 | 0.785 | 0.782 | 0.932 | 0.839 | 0.730 | 0.854 |
| SimpleBaseline |Res101|256x192| 0.722 | 0.891 | 0.795 | 0.687 | 0.795 | 0.794 | 0.936 | 0.855 | 0.745 | 0.863 |
| SimpleBaseline |Res152|256x192| 0.724 | 0.888 | 0.794 | 0.688 | 0.795 | 0.795 | 0.934 | 0.856 | 0.746 | 0.863 |
| MSPN_4stage |MSPN|256x192| 0.752 | 0.900 | 0.819 | 0.716 | 0.825 | 0.819 | 0.943 | 0.875 | 0.770 | 0.887 |
## Installation and Environment Setup
......@@ -38,8 +38,7 @@ ${COCO_DATA_ROOT}
| |-- person_keypoints_val2017.json
|-- person_detection_results
| |-- COCO_val2017_detections_AP_H_56_person.json
|-- images
| |-- train2017
| |-- 000000000009.jpg
| |-- 000000000025.jpg
| |-- 000000000030.jpg
......@@ -51,28 +50,31 @@ ${COCO_DATA_ROOT}
|-- ...
```
Change `data_root` in [config.py](./config.py) to `${COCO_DATA_ROOT}`.
3. Start training:
The command-line arguments of `train.py` are:
- `--arch`, name of the network to train;
- `--resume`, path of a trained model to resume from;
- `--ngpus`, number of GPUs to use;
- `--multi_scale_supervision`, whether to use multi-scale supervision.
For example, to train SimpleBaseline_Res50:
```bash
python3 train.py --arch simplebaseline_res50 \
                 --resume /path/to/model \
                 --ngpus 8 \
                 --multi_scale_supervision False
```
To train MSPN:
```bash
python3 train.py --arch mspn_4stage \
--resume /path/to/model \
--ngpus 8 \
--multi_scale_supervision True
```
## How to Test
......@@ -80,37 +82,34 @@ python3 train.py --arch name/of/model \
After a model is trained, you can evaluate it on the COCO val2017 validation set with:
```bash
python3 test.py --arch name/of/network \
                --model /path/to/model.pkl \
                --dt_file name/of/human/detection/results
```
The command-line arguments of `test.py` are:
- `--arch`, name of the network;
- `--model`, the model to be evaluated;
- `--dt_file`, file name of the human detection results.
## How to Use
After a model is trained, you can run it on a single image (a pretrained RetinaNet first detects the person boxes) and obtain a visualization of the estimated human poses:
```bash
python3 inference.py --arch /name/of/tested/network \
--detector /name/of/human/detector \
--model /path/to/model \
--image /path/to/image.jpg
```
The command-line arguments of `inference.py` are:
- `--arch`, name of the network;
- `--detector`, name of the human detector;
- `--model`, path of the trained model to load;
- `--image`, path of the image to test (see the programmatic sketch below).
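The same pipeline is also exposed programmatically through the `KeypointEvaluator` class used by `inference.py`. A rough sketch follows; the model constructors and the two inference callables are simplified assumptions (`inference.py` itself wraps them with jit tracing):
```python
import cv2
from official.vision.keypoints.inference import KeypointEvaluator
import official.vision.keypoints.models as kpm
import official.vision.detection.retinanet_res50_coco_1x_800size as Det

# assumption: pretrained=True fetches the released weights for both models
detector = Det.retinanet_res50_coco_1x_800size(pretrained=True)
keypoint_model = kpm.mspn_4stage(pretrained=True)
detector.eval()
keypoint_model.eval()

# plain (untraced) inference callables
det_func = lambda: detector(detector.inputs)
keypoint_func = lambda: keypoint_model.predict()

evaluator = KeypointEvaluator(detector, det_func, keypoint_model, keypoint_func)
image = cv2.imread("test.jpg")
person_boxes = evaluator.detect_persons(image)
all_keypoints = evaluator.predict(image, person_boxes)
cv2.imwrite("vis_skeleton.jpg", evaluator.vis_skeletons(image, all_keypoints))
```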
## References
- [Simple Baselines for Human Pose Estimation and Tracking](https://arxiv.org/pdf/1804.06208.pdf), Bin Xiao, Haiping Wu, and Yichen Wei
- [Rethinking on Multi-Stage Networks for Human Pose Estimation](https://arxiv.org/pdf/1901.00148.pdf), Wenbo Li, Zhicheng Wang, Binyi Yin, Qixiang Peng, Yuming Du, Tianzi Xiao, Gang Yu, Hongtao Lu, Yichen Wei, and Jian Sun
......@@ -8,18 +8,21 @@
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
class Config:
################## train ##############################################
initial_lr = 3e-4
lr_ratio = 0.1
batch_size = 32
epochs = 200
warm_epochs = 1
weight_decay = 1e-5
################## data ###############################################
# basic
# path
data_root = "/data/coco_data/"
# normalize
img_mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
img_std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
# shape
input_shape = (256, 192)
......@@ -27,11 +30,15 @@ class Config:
# heat maps
keypoint_num = 17
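# one Gaussian sigma per supervision scale, coarsest heatmap first; train.py
# keeps only heat_kernel[-1:] when multi-scale supervision is disabled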
heat_kernel = [2.6, 2.0, 1.7, 1.4]
heat_thr = 1e-2
heat_range = 255
##################### augmentation #####################################
half_body_transform = True
extend_boxes = True
# extend
x_ext = 0.6
y_ext = 0.6
......@@ -58,3 +65,46 @@ class Config:
test_y_ext = 0.10
test_gaussian_kernel = 17
second_value_aug = True
vis_colors = [
[255, 0, 0],
[255, 85, 0],
[255, 170, 0],
[255, 255, 0],
[170, 255, 0],
[85, 255, 0],
[0, 255, 0],
[0, 255, 85],
[0, 255, 170],
[0, 255, 255],
[0, 170, 255],
[0, 85, 255],
[0, 0, 255],
[85, 0, 255],
[170, 0, 255],
[255, 0, 255],
[255, 0, 170],
[255, 0, 85],
[255, 85, 85],
[255, 170, 85],
[255, 170, 170],
]
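# pairs of keypoint indices (COCO order: nose, eyes, ears, shoulders, elbows,
# wrists, hips, knees, ankles) that are connected when drawing the skeleton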
vis_skeletons = [
[0, 1],
[0, 2],
[1, 3],
[2, 4],
[5, 6],
[5, 7],
[7, 9],
[6, 8],
[8, 10],
[5, 11],
[6, 12],
[11, 12],
[11, 13],
[13, 15],
[12, 14],
[14, 16],
]
......@@ -127,7 +127,6 @@ class COCOJoints(VisionDataset):
ann = self.anns[index]
img_id = ann["image_id"]
target = []
for k in self.order:
if k == "image":
......
......@@ -20,7 +20,7 @@ from official.vision.keypoints.transforms import get_affine_transform
from official.vision.keypoints.config import Config as cfg
import official.vision.keypoints.models as M
import official.vision.detection.retinanet_res50_coco_1x_800size as Det
from official.vision.detection.tools.test import DetEvaluator
from official.vision.keypoints.test import find_keypoints
......@@ -38,10 +38,11 @@ def make_parser():
"simplebaseline_res50",
"simplebaseline_res101",
"simplebaseline_res152",
"mspn_4stage",
],
)
parser.add_argument(
"-det", "--detector", default="retinanet_res50_1x_800size", type=str,
"-det", "--detector", default="retinanet_res50_coco_1x_800size", type=str,
)
parser.add_argument(
......@@ -51,37 +52,127 @@ def make_parser():
type=str,
)
parser.add_argument(
"-image", "--image", default="/data/test_keyoint.jpeg", type=str
"-image", "--image", default="/data/test_keypoint.jpeg", type=str
)
return parser
class KeypointEvaluator:
    def __init__(self, detect_model, det_func, keypoint_model, keypoint_func):
        self.detector = detect_model
        self.det_func = det_func
        self.keypoint_model = keypoint_model
        self.keypoint_func = keypoint_func

    def detect_persons(self, image):
        data, im_info = DetEvaluator.process_inputs(
            image.copy(),
            self.detector.cfg.test_image_short_size,
            self.detector.cfg.test_image_max_size,
        )
        self.detector.inputs["im_info"].set_value(im_info)
        self.detector.inputs["image"].set_value(data.astype(np.float32))
        evaluator = DetEvaluator(self.detector)
        det_res = evaluator.predict(self.det_func)
        # keep only boxes of the "person" class (class id 1 in COCO)
        persons = []
        for d in det_res:
            cls_id = int(d[5] + 1)
            if cls_id == 1:
                bbox = d[:4]
                persons.append(bbox)
        return persons

    def predict_single_person(self, image, bbox):
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        center_x = (bbox[0] + bbox[2]) / 2
        center_y = (bbox[1] + bbox[3]) / 2
        # extend the detected box, then pad it to the input aspect ratio
        extend_w = w * (1 + cfg.test_x_ext)
        extend_h = h * (1 + cfg.test_y_ext)
        w_h_ratio = cfg.input_shape[1] / cfg.input_shape[0]
        if extend_w / extend_h > w_h_ratio:
            extend_h = extend_w / w_h_ratio
        else:
            extend_w = extend_h * w_h_ratio

        trans = get_affine_transform(
            np.array([center_x, center_y]),
            np.array([extend_h, extend_w]),
            1,
            0,
            cfg.input_shape,
        )
        cropped_img = cv2.warpAffine(
            image,
            trans,
            (int(cfg.input_shape[1]), int(cfg.input_shape[0])),
            flags=cv2.INTER_LINEAR,
            borderValue=0,
        )
        # test-time flip augmentation: average the predictions for the
        # original and the horizontally flipped crop
        flipped_img = cropped_img[:, ::-1]
        keypoint_input = np.stack([cropped_img, flipped_img], 0)
        keypoint_input = keypoint_input.transpose(0, 3, 1, 2)
        keypoint_input = np.ascontiguousarray(keypoint_input).astype(np.float32)

        self.keypoint_model.inputs["image"].set_value(keypoint_input)
        outs = self.keypoint_func()
        outs = outs.numpy()
        pred = outs[0]
        flipped_pred = outs[1][cfg.keypoint_flip_order][:, :, ::-1]
        pred = (pred + flipped_pred) / 2

        keypoints = find_keypoints(pred, bbox)
        return keypoints

    def predict(self, image, bboxes):
        normalized_img = (image - np.array(cfg.img_mean).reshape(1, 1, 3)) / np.array(
            cfg.img_std
        ).reshape(1, 1, 3)
        all_keypoints = []
        for bbox in bboxes:
            keypoints = self.predict_single_person(normalized_img, bbox)
            all_keypoints.append(keypoints)
        return all_keypoints

    @staticmethod
    def vis_skeletons(img, all_keypoints):
        canvas = img.copy()
        for keypoints in all_keypoints:
            for ind, skeleton in enumerate(cfg.vis_skeletons):
                joint1 = skeleton[0]
                joint2 = skeleton[1]
                X = np.array([keypoints[joint1, 0], keypoints[joint2, 0]])
                Y = np.array([keypoints[joint1, 1], keypoints[joint2, 1]])
                mX = np.mean(X)
                mY = np.mean(Y)
                length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
                angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
                # draw each limb as a filled ellipse, alpha-blended onto the canvas
                polygon = cv2.ellipse2Poly(
                    (int(mX), int(mY)), (int(length / 2), 4), int(angle), 0, 360, 1
                )
                cur_canvas = canvas.copy()
                cv2.fillConvexPoly(cur_canvas, polygon, cfg.vis_colors[ind])
                canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
        return canvas
def main():
......@@ -108,78 +199,18 @@ def main():
pred = keypoint_model.predict()
return pred
evaluator = KeypointEvaluator(detector, det_func, keypoint_model, keypoint_func)

image = cv2.imread(args.image)

logger.info("Detecting Humans")
person_boxes = evaluator.detect_persons(image)

logger.info("Detecting Keypoints")
all_keypoints = evaluator.predict(image, person_boxes)

logger.info("Visualizing")
canvas = evaluator.vis_skeletons(image, all_keypoints)
cv2.imwrite("vis_skeleton.jpg", canvas)
......
......@@ -11,3 +11,5 @@ from .simplebaseline import (
simplebaseline_res101,
simplebaseline_res152,
)
from .mspn import mspn_4stage
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import megengine as mge
import megengine.functional as F
import megengine.hub as hub
import megengine.module as M
import math
import official.vision.classification.resnet.model as resnet
import numpy as np
class ResnetBody(M.Module):
def __init__(
self,
block,
init_channel,
layers,
channels,
zero_init_residual=False,
norm=M.BatchNorm2d,
):
super(ResnetBody, self).__init__()
self.in_channels = init_channel
self.layer1 = self._make_layer(
block, channels[0], layers[0], stride=1, norm=norm
)
self.layer2 = self._make_layer(
block, channels[1], layers[1], stride=2, norm=norm
)
self.layer3 = self._make_layer(
block, channels[2], layers[2], stride=2, norm=norm,
)
self.layer4 = self._make_layer(
block, channels[3], layers[3], stride=2, norm=norm,
)
for m in self.modules():
if isinstance(m, M.Conv2d):
M.init.msra_normal_(m.weight, mode="fan_out", nonlinearity="relu")
if m.bias is not None:
fan_in, _ = M.init.calculate_fan_in_and_fan_out(m.weight)
bound = 1 / math.sqrt(fan_in)
M.init.uniform_(m.bias, -bound, bound)
elif isinstance(m, M.BatchNorm2d):
M.init.ones_(m.weight)
M.init.zeros_(m.bias)
elif isinstance(m, M.Linear):
M.init.msra_uniform_(m.weight, a=math.sqrt(5))
if m.bias is not None:
fan_in, _ = M.init.calculate_fan_in_and_fan_out(m.weight)
bound = 1 / math.sqrt(fan_in)
M.init.uniform_(m.bias, -bound, bound)
def _make_layer(self, block, channels, blocks, stride=1, norm=M.BatchNorm2d):
layers = []
layers.append(block(self.in_channels, channels, stride, norm=norm))
self.in_channels = channels * block.expansion
for _ in range(1, blocks):
layers.append(block(self.in_channels, channels, norm=norm))
return M.Sequential(*layers)
def forward(self, x):
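# returns the feature maps of all four residual stages, finest (layer1)
# to coarsest (layer4); SingleStage reverses this list for its top-down path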
outputs = []
x = self.layer1(x)
outputs.append(x)
x = self.layer2(x)
outputs.append(x)
x = self.layer3(x)
outputs.append(x)
x = self.layer4(x)
outputs.append(x)
return outputs
class SingleStage(M.Module):
def __init__(
self, block, init_channel, layers, channels, mid_channel, norm=M.BatchNorm2d
):
super(SingleStage, self).__init__()
self.down = ResnetBody(block, init_channel, layers, channels, norm)
channel = block.expansion * channels[-1]
self.up1 = M.Sequential(
M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel)
)
self.deconv1 = M.Sequential(
M.ConvTranspose2d(mid_channel, mid_channel, 4, 2, 1), norm(mid_channel)
)
channel = block.expansion * channels[-2]
self.up2 = M.Sequential(
M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel)
)
self.deconv2 = M.Sequential(
M.ConvTranspose2d(mid_channel, mid_channel, 4, 2, 1), norm(mid_channel)
)
channel = block.expansion * channels[-3]
self.up3 = M.Sequential(
M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel)
)
self.deconv3 = M.Sequential(
M.ConvTranspose2d(mid_channel, mid_channel, 4, 2, 1), norm(mid_channel)
)
channel = block.expansion * channels[-4]
self.up4 = M.Sequential(
M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel)
)
def forward(self, x):
branches = self.down(x)
branches = list(reversed(branches))
outputs = []
f_up = F.relu(self.up1(branches[0]))
outputs.append(f_up)
f = self.up2(branches[1])
f_up = F.relu(self.deconv1(f_up) + f)
outputs.append(f_up)
f = self.up3(branches[2])
f_up = F.relu(self.deconv2(f_up) + f)
outputs.append(f_up)
f = self.up4(branches[3])
f_up = F.relu(self.deconv3(f_up) + f)
outputs.append(f_up)
return outputs
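# MSPN stacks nr_stg single-stage U-shaped networks. A shared head downsamples
# the input 4x; each stage predicts heatmaps from its four feature scales and
# hands its finest feature map to the next stage through a 1x1 conv
# ("Stage_i_next").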
class MSPN(M.Module):
def __init__(self, block, layers, channels, mid_channel, keypoint_num, nr_stg):
super(MSPN, self).__init__()
block = getattr(resnet, block)
norm = M.BatchNorm2d
self.nr_stg = nr_stg
self.keypoint_num = keypoint_num
self.head = M.Sequential(
M.Conv2d(3, 64, 3, 2, 1),
norm(64),
M.ReLU(),
M.Conv2d(64, 64, 3, 1, 1),
norm(64),
M.ReLU(),
M.Conv2d(64, 64, 3, 2, 1),
norm(64),
M.ReLU(),
)
self.stages = {}
for i in range(nr_stg):
init_channel = 64
self.stages["Stage_{}_body".format(i)] = SingleStage(
block, init_channel, layers, channels, mid_channel, norm
)
tail = {}
for j in range(4):
tail["tail_{}".format(j)] = M.Conv2d(mid_channel, keypoint_num, 3, 1, 1)
self.stages["Stage_{}_tail".format(i)] = tail
if i < nr_stg - 1:
self.stages["Stage_{}_next".format(i)] = M.Sequential(
M.Conv2d(mid_channel, 64, 1, 1, 0), norm(64), M.ReLU()
)
self.inputs = {
"image": mge.tensor(dtype="float32"),
"heatmap": mge.tensor(dtype="float32"),
"heat_valid": mge.tensor(dtype="float32"),
}
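# Loss, as read from the code below: every stage is supervised at all four
# scales with an L2 loss; heat_valid > 1.1 keeps only keypoints annotated as
# visible (COCO visibility flag 2), while the OHKM term on the final heatmap
# keeps any labeled keypoint (flag > 0) and averages the hardest half of the
# joints per sample.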
def calc_loss(self):
outs = self.forward(self.inputs["image"])
loss = 0
for stage_out in outs:
for ind, scale_out in enumerate(stage_out[:-1]):
label = (
self.inputs["heatmap"][:, ind]
* (self.inputs["heat_valid"] > 1.1)[:, :, None, None]
)
tmp = F.square_loss(scale_out, label)
loss += tmp / 4 / len(outs)
# OHKM loss for the largest heatmap
tmp = ((stage_out[-1] - self.inputs["heatmap"][:, -1]) ** 2).mean(3).mean(
2
) * (self.inputs["heat_valid"] > 0.1)
ohkm_loss = 0
for i in range(tmp.shape[0]):
selected_loss, _ = F.top_k(
tmp[i], self.keypoint_num // 2, descending=True
)
ohkm_loss += selected_loss.mean()
ohkm_loss /= tmp.shape[0]
loss += ohkm_loss
return loss
def predict(self):
outputs = self.forward(self.inputs["image"])
pred = outputs[-1][-1]
return pred
def forward(self, x):
f = self.head(x)
outputs = []
for i in range(self.nr_stg):
multi_scale_features = self.stages["Stage_{}_body".format(i)](f)
multi_scale_heatmaps = []
for j in range(4):
out = self.stages["Stage_{}_tail".format(i)]["tail_{}".format(j)](
multi_scale_features[j]
)
out = F.interpolate(out, scale_factor=2 ** (3 - j))
multi_scale_heatmaps.append(out)
if i < self.nr_stg - 1:
f = self.stages["Stage_{}_next".format(i)](multi_scale_features[-1])
outputs.append(multi_scale_heatmaps)
return outputs
@hub.pretrained(
"https://data.megengine.org.cn/models/weights/mspn_4stage_256x192_0_255_75_2.pkl"
)
def mspn_4stage(**kwargs):
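# note: layers=[5, 5, 6, 3] and channels=[64, 128, 192, 384] are the stage
# depths/widths chosen for MSPN, not the stock ResNet-50 values
# ([3, 4, 6, 3] with [64, 128, 256, 512])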
model = MSPN(
block="Bottleneck",
layers=[5, 5, 6, 3],
channels=[64, 128, 192, 384],
nr_stg=4,
mid_channel=256,
keypoint_num=17,
**kwargs
)
return model
......@@ -13,7 +13,6 @@ import megengine.module as M
import official.vision.classification.resnet.model as resnet
import numpy as np
class DeconvLayers(M.Module):
......@@ -38,10 +37,12 @@ class DeconvLayers(M.Module):
class SimpleBaseline(M.Module):
    def __init__(self, backbone, cfg):
        super(SimpleBaseline, self).__init__()
        norm = M.BatchNorm2d
        self.backbone = getattr(resnet, backbone)(
            norm=norm, pretrained=cfg.backbone_pretrained
        )
        del self.backbone.fc

        self.cfg = cfg
......@@ -67,7 +68,7 @@ class SimpleBaseline(M.Module):
def calc_loss(self):
out = self.forward(self.inputs["image"])
valid = self.inputs["heat_valid"][:, :, None, None]
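# the data pipeline now produces one heatmap per entry of cfg.heat_kernel;
# SimpleBaseline is supervised only on the last (finest-kernel) one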
label = self.inputs["heatmap"][:, 0]
label = self.inputs["heatmap"][:, -1]
loss = F.square_loss(out * valid, label * valid)
return loss
......@@ -101,8 +102,8 @@ class SimpleBaseline_Config:
deconv_channels = [256, 256, 256]
deconv_kernel_sizes = [4, 4, 4]
deconv_with_bias = False
keypoint_num = 17
backbone_pretrained = True
cfg = SimpleBaseline_Config()
......
......@@ -34,7 +34,7 @@ logger = mge.get_logger(__name__)
def build_dataloader(rank, world_size, data_root, ann_file):
val_dataset = COCOJoints(
data_root, ann_file, image_set="val", order=("image", "boxes", "info")
data_root, ann_file, image_set="val2017", order=("image", "boxes", "info")
)
val_sampler = SequentialSampler(val_dataset, 1, world_size=world_size, rank=rank)
val_dataloader = DataLoader(
......@@ -43,7 +43,7 @@ def build_dataloader(rank, world_size, data_root, ann_file):
num_workers=4,
transform=T.Compose(
transforms=[
T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
ExtendBoxes(
cfg.test_x_ext,
cfg.test_y_ext,
......@@ -213,17 +213,10 @@ def worker(
def make_parser():
parser = argparse.ArgumentParser()
parser.add_argument("-n", "--ngpus", default=8, type=int)
parser.add_argument("-d", "--data_root", default="/", type=str)
parser.add_argument(
"-gt",
"--gt_path",
default="/data/coco/annotations/person_keypoints_val2017.json",
type=str,
)
parser.add_argument(
"-dt",
"--dt_path",
default="/data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json",
"--dt_file",
default="COCO_val2017_detections_AP_H_56_person.json",
type=str,
)
parser.add_argument("-se", "--start_epoch", default=-1, type=int)
......@@ -237,6 +230,7 @@ def make_parser():
"simplebaseline_res50",
"Simplebaseline_res101",
"Simplebaseline_res152",
"mspn_4stage",
],
)
parser.add_argument(
......@@ -255,8 +249,13 @@ def main():
parser = make_parser()
args = parser.parse_args()
dt_path = os.path.join(cfg.data_root, "person_detection_results", args.dt_file)
dets = json.load(open(dt_path, "r"))

gt_path = os.path.join(
    cfg.data_root, "annotations", "person_keypoints_val2017.json"
)
eval_gt = COCO(gt_path)
gt = eval_gt.dataset
dets = [
......@@ -285,7 +284,7 @@ def main():
args=(
args.arch,
model_file,
cfg.data_root,
ann_file,
i,
args.ngpus,
......
......@@ -47,21 +47,12 @@ def main():
"simplebaseline_res50",
"simplebaseline_res101",
"simplebaseline_res152",
"mspn_4stage",
],
)
parser.add_argument("--pretrained", default=True, type=bool)
parser.add_argument("-s", "--save", default="/data/models", type=str)
parser.add_argument("--data_root", default="/data/coco/images/", type=str)
parser.add_argument(
"--ann_file",
default="/data/coco/annotations/person_keypoints_train2017.json",
type=str,
)
parser.add_argument("--continue", default=None, type=str)
parser.add_argument("-b", "--batch_size", default=64, type=int)
parser.add_argument("--lr", default=6e-4, type=float)
parser.add_argument("--epochs", default=200, type=int)
parser.add_argument("--resume", default=None, type=str)
parser.add_argument("--multi_scale_supervision", default=True, type=bool)
......@@ -81,7 +72,7 @@ def main():
if world_size > 1:
# scale learning rate by number of gpus
cfg.initial_lr *= world_size
# start distributed training, dispatch sub-processes
processes = []
for rank in range(world_size):
......@@ -110,30 +101,35 @@ def worker(rank, world_size, args):
model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
save_dir = os.path.join(args.save, model_name)
model = getattr(M, args.arch)()
model.train()
start_epoch = 0
if args.resume is not None:
    file = mge.load(args.resume)
model.load_state_dict(file["state_dict"])
start_epoch = file["epoch"]
optimizer = optim.Adam(
model.parameters(requires_grad=True),
lr=cfg.initial_lr,
weight_decay=cfg.weight_decay,
)
# Build train datasets
logger.info("preparing dataset..")
ann_file = os.path.join(
cfg.data_root, "annotations", "person_keypoints_train2017.json"
)
train_dataset = COCOJoints(
cfg.data_root,
ann_file,
image_set="train2017",
order=("image", "keypoints", "boxes", "info"),
)
train_sampler = data.RandomSampler(
train_dataset, batch_size=cfg.batch_size, drop_last=True
)
transforms = [T.Normalize(mean=cfg.img_mean, std=cfg.img_std)]
if cfg.half_body_transform:
transforms.append(
HalfBodyTransform(
......@@ -167,14 +163,14 @@ def worker(rank, world_size, args):
cfg.input_shape,
cfg.output_shape,
cfg.keypoint_num,
cfg.heat_thr,
cfg.heat_kernel if args.multi_scale_supervision else cfg.heat_kernel[-1:],
cfg.heat_range,
),
)
# Start training
for epoch in range(start_epoch, cfg.epochs):
loss = train(model, train_queue, optimizer, args, epoch=epoch)
logger.info("Epoch %d Train %.6f ", epoch, loss)
......@@ -208,10 +204,10 @@ def train(model, data_queue, optimizer, args, epoch=0):
) * current_step / cfg.warm_epochs / len(data_queue)
else:
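# after warmup, decay the LR factor linearly to zero over the remaining epochs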
lr_factor = 1 - (current_step - len(data_queue) * cfg.warm_epochs) / (
len(data_queue) * (cfg.epochs - cfg.warm_epochs)
)
lr = cfg.initial_lr * lr_factor
param_group["lr"] = lr
lr = optimizer.param_groups[0]["lr"]
......