diff --git a/README.md b/README.md index ea3918d8517e4904f4450264cda2eaa3acdffd64..cfcfc3f38160626ce178a92c556f369a17e61505 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,16 @@ export PYTHONPATH=/path/to/models:$PYTHONPATH | :--: |:--: |:--: |:--: | | Deeplabv3plus | Resnet101 | 79.0 | 79.8 | +### 人体关节点检测 + +我们提供了人体关节点检测的经典模型[SimpleBaseline](https://arxiv.org/pdf/1804.06208.pdf)和高精度模型[MSPN](https://arxiv.org/pdf/1901.00148.pdf),使用在COCO val2017上人体检测AP为56的检测结果,提供的模型在COCO val2017上的关节点检测结果为: + +|Methods|Backbone|Input Size| AP | Ap .5 | AP .75 | AP (M) | AP (L) | AR | AR .5 | AR .75 | AR (M) | AR (L) | +|---|:---:|---|---|---|---|---|---|---|---|---|---|---| +| SimpleBaseline |Res50 |256x192| 0.712 | 0.887 | 0.779 | 0.673 | 0.785 | 0.782 | 0.932 | 0.839 | 0.730 | 0.854 | +| SimpleBaseline |Res101|256x192| 0.722 | 0.891 | 0.795 | 0.687 | 0.795 | 0.794 | 0.936 | 0.855 | 0.745 | 0.863 | +| SimpleBaseline |Res152|256x192| 0.724 | 0.888 | 0.794 | 0.688 | 0.795 | 0.795 | 0.934 | 0.856 | 0.746 | 0.863 | +| MSPN_4stage |MSPN|256x192| 0.752 | 0.900 | 0.819 | 0.716 | 0.825 | 0.819 | 0.943 | 0.875 | 0.770 | 0.887 | ### 自然语言处理 diff --git a/hubconf.py b/hubconf.py index 86f942b4cc9e6c5048d95e2a3b29d82f1fcf212b..5742000cc9b2dddff061c2d555402320ee8f3ad3 100644 --- a/hubconf.py +++ b/hubconf.py @@ -43,4 +43,7 @@ from official.vision.keypoints.models import ( simplebaseline_res50, simplebaseline_res101, simplebaseline_res152, + mspn_4stage ) + +from official.vision.keypoints.inference import KeypointEvaluator diff --git a/official/vision/keypoints/README.md b/official/vision/keypoints/README.md index abbf32dfd4ad90535fd7c38e4fbb2afee5f018be..ba74c0c676183b37e8ecee58c6afbbfba6fcdca3 100644 --- a/official/vision/keypoints/README.md +++ b/official/vision/keypoints/README.md @@ -5,10 +5,10 @@ 本目录使用了在COCO val2017上的Human AP为56.4的人体检测结果,最后在COCO val2017上人体关节点估计结果为 |Methods|Backbone|Input Size| AP | Ap .5 | AP .75 | AP (M) | AP (L) | AR | AR .5 | AR .75 | AR (M) | AR (L) | |---|:---:|---|---|---|---|---|---|---|---|---|---|---| -| SimpleBaseline |Res50 |256x192| 71.2 | 0.887 | 0.779 | 0.673 | 0.785 | 0.782 | 0.932 | 0.839 | 0.730 | 0.854 | -| SimpleBaseline |Res101|256x192| 72.2 | 0.891 | 0.795 | 0.687 | 0.795 | 0.794 | 0.936 | 0.855 | 0.745 | 0.863 | -| SimpleBaseline |Res152|256x192| 72.4 | 0.888 | 0.794 | 0.688 | 0.795 | 0.795 | 0.934 | 0.856 | 0.746 | 0.863 | - +| SimpleBaseline |Res50 |256x192| 0.712 | 0.887 | 0.779 | 0.673 | 0.785 | 0.782 | 0.932 | 0.839 | 0.730 | 0.854 | +| SimpleBaseline |Res101|256x192| 0.722 | 0.891 | 0.795 | 0.687 | 0.795 | 0.794 | 0.936 | 0.855 | 0.745 | 0.863 | +| SimpleBaseline |Res152|256x192| 0.724 | 0.888 | 0.794 | 0.688 | 0.795 | 0.795 | 0.934 | 0.856 | 0.746 | 0.863 | +| MSPN_4stage |MSPN|256x192| 0.752 | 0.900 | 0.819 | 0.716 | 0.825 | 0.819 | 0.943 | 0.875 | 0.770 | 0.887 | ## 安装和环境配置 @@ -38,8 +38,7 @@ ${COCO_DATA_ROOT} | |-- person_keypoints_val2017.json |-- person_detection_results | |-- COCO_val2017_detections_AP_H_56_person.json -|-- images - |-- train2017 +|-- |-- train2017 | |-- 000000000009.jpg | |-- 000000000025.jpg | |-- 000000000030.jpg @@ -51,28 +50,31 @@ ${COCO_DATA_ROOT} |-- ... 
``` +更改[config.py](.config.py)中的`data_root`为${COCO_DATA_ROOT} 3、开始训练: `train.py`的命令行参数如下: -- `--arch`, 训练的模型的名字 -- `--data_root`,COCO数据集里`images`的路径; -- `--ann_file`, COCO数据集里标注文件的`json`路径 -- `--batch_size`,训练时采用的batch size, 默认32; -- `--ngpus`, 训练时采用的gpu数量,默认8; 当设置为1时,表示单卡训练 -- `--continue`, 是否从已训好的模型继续训练; -- `--epochs`, 需要训练的epoch数量; -- `--lr`, 初始学习率; +- `--arch`, 训练的网络的名字 +- `--resume`, 是否从已训好的模型继续训练 +- `--ngpus`, 使用的GPU数量 +- `--multi_scale_supervision`, 是否使用多尺度监督; +例如训练SimpleBaseline_Res50: ```bash -python3 train.py --arch name/of/model \ - --data_root /path/to/COCO/images \ - --ann_file /path/to/person_keypoints.json \ - --batch_size 32 \ - --lr 0.0003 \ +python3 train.py --arch simplebaseline_res50 \ + --resume /path/to/model \ --ngpus 8 \ - --epochs 200 \ - --continue /path/to/model + --multi_scale_supervision False + +``` +训练MSPN: +```bash +python3 train.py --arch mspn_4stage \ + --resume /path/to/model \ + --ngpus 8 \ + --multi_scale_supervision True + ``` ## 如何测试 @@ -80,37 +82,34 @@ python3 train.py --arch name/of/model \ 模型训练好之后,可以通过如下命令测试模型在COCOval2017验证集的性能: ```bash -python3 test.py --arch name/of/model \ - --data_root /path/to/COCO/images \ +python3 test.py --arch name/of/network \ --model /path/to/model.pkl \ - --gt_path /path/to/ground/truth/annotations - --dt_path /path/to/human/detection/results + --dt_file /name/human/detection/results ``` `test.py`的命令行参数如下: -- `--arch`, 训练的模型的名字 -- `--data_root`,COCO数据集里`images`的路径; -- `--gt_path`, COCO数据集里验证集的标注文件; -- `--dt_path`,人体检测结果; -- `--model`, 待检测的模型 +- `--arch`, 网络的名字; +- `--model`, 待检测的模; +- `--dt_path`,人体检测结果. ## 如何使用 模型训练好之后,可以通过如下命令测试单张图片(先使用预训练的RetainNet检测出人的框),得到人体姿态可视化结果: ```bash -python3 inference.py --arch /name/of/tested/model \ +python3 inference.py --arch /name/of/tested/network \ + --detector /name/of/human/detector \ --model /path/to/model \ --image /path/to/image.jpg ``` `inference.py`的命令行参数如下: - `--arch`, 网络的名字; +- `--detector`, 人体检测器的名字; - `--model`,载入训练好的模型; -- `--image`,载入待测试的图像 +- `--image`,载入待测试的图像. ## 参考文献 - [Simple Baselines for Human Pose Estimation and Tracking](https://arxiv.org/pdf/1804.06208.pdf), Bin Xiao, Haiping Wu, and Yichen Wei -- [Rethinking on Multi-Stage Networks for Human Pose Estimation](https://arxiv.org/pdf/1901.00148.pdf) Wenbo Li1, Zhicheng Wang, Binyi Yin, Qixiang Peng, Yuming Du, Tianzi Xiao, Gang Yu, Hongtao Lu, Yichen Wei and Jian Sun - +- [Rethinking on Multi-Stage Networks for Human Pose Estimation](https://arxiv.org/pdf/1901.00148.pdf) Wenbo Li1, Zhicheng Wang, Binyi Yin, Qixiang Peng, Yuming Du, Tianzi Xiao, Gang Yu, Hongtao Lu, Yichen Wei and Jian Sun \ No newline at end of file diff --git a/official/vision/keypoints/config.py b/official/vision/keypoints/config.py index 52edfa063820576fd19f6a5c5941d7dcca78fe87..d0765f25d98c26a548113c141676fc83b8ff4d0c 100644 --- a/official/vision/keypoints/config.py +++ b/official/vision/keypoints/config.py @@ -8,18 +8,21 @@ # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class Config: ##############3## train ############################################## + initial_lr = 3e-4 lr_ratio = 0.1 + + batch_size = 32 + epochs = 200 warm_epochs = 1 weight_decay = 1e-5 - half_body_transform = True - extend_boxes = True - ################## data ############################################### - # basic + # path + data_root = "/data/coco_data/" + # normalize - IMG_MEAN = [0.485 * 255, 0.456 * 255, 0.406 * 255] - IMG_STD = [0.229 * 255, 0.224 * 255, 0.225 * 255] + img_mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] + img_std = [0.229 * 255, 0.224 * 255, 0.225 * 255] # shape input_shape = (256, 192) @@ -27,11 +30,15 @@ class Config: # heat maps keypoint_num = 17 - heat_kernel = 1.5 + heat_kernel = [2.6, 2.0, 1.7, 1.4] heat_thr = 1e-2 heat_range = 255 ##################### augumentation ##################################### + + half_body_transform = True + extend_boxes = True + # extend x_ext = 0.6 y_ext = 0.6 @@ -58,3 +65,46 @@ class Config: test_y_ext = 0.10 test_gaussian_kernel = 17 second_value_aug = True + + vis_colors = [ + [255, 0, 0], + [255, 85, 0], + [255, 170, 0], + [255, 255, 0], + [170, 255, 0], + [85, 255, 0], + [0, 255, 0], + [0, 255, 85], + [0, 255, 170], + [0, 255, 255], + [0, 170, 255], + [0, 85, 255], + [0, 0, 255], + [85, 0, 255], + [170, 0, 255], + [255, 0, 255], + [255, 0, 170], + [255, 0, 85], + [255, 85, 85], + [255, 170, 85], + [255, 170, 170], + ] + + vis_skeletons = [ + [0, 1], + [0, 2], + [1, 3], + [2, 4], + [5, 6], + [5, 7], + [7, 9], + [6, 8], + [8, 10], + [5, 11], + [6, 12], + [11, 12], + [11, 13], + [13, 15], + [12, 14], + [14, 16], + ] diff --git a/official/vision/keypoints/dataset.py b/official/vision/keypoints/dataset.py index 9318b4d5c06fcd89b3cbba46c2152ad641451d55..f2fc9cd6240dd0bb87544d8dafbb98b6b06c8c7d 100644 --- a/official/vision/keypoints/dataset.py +++ b/official/vision/keypoints/dataset.py @@ -127,7 +127,6 @@ class COCOJoints(VisionDataset): ann = self.anns[index] img_id = ann["image_id"] - target = [] for k in self.order: if k == "image": diff --git a/official/vision/keypoints/inference.py b/official/vision/keypoints/inference.py index 006418ab06565e2af394e7881cf3e92880fa786d..7c1c49b51fb98b3aaa9d55968df299ebfbf18a2a 100644 --- a/official/vision/keypoints/inference.py +++ b/official/vision/keypoints/inference.py @@ -20,7 +20,7 @@ from official.vision.keypoints.transforms import get_affine_transform from official.vision.keypoints.config import Config as cfg import official.vision.keypoints.models as M -import official.vision.detection.retinanet_res50_1x_800size as Det +import official.vision.detection.retinanet_res50_coco_1x_800size as Det from official.vision.detection.tools.test import DetEvaluator from official.vision.keypoints.test import find_keypoints @@ -38,10 +38,11 @@ def make_parser(): "simplebaseline_res50", "simplebaseline_res101", "simplebaseline_res152", + "mspn_4stage", ], ) parser.add_argument( - "-det", "--detector", default="retinanet_res50_1x_800size", type=str, + "-det", "--detector", default="retinanet_res50_coco_1x_800size", type=str, ) parser.add_argument( @@ -51,37 +52,127 @@ def make_parser(): type=str, ) parser.add_argument( - "-image", "--image", default="/data/test_keyoint.jpeg", type=str + "-image", "--image", default="/data/test_keypoint.jpeg", type=str ) return parser -def vis_skeleton(img, all_keypoints): +class KeypointEvaluator: + def __init__(self, detect_model, det_func, keypoint_model, keypoint_func): - canvas = img.copy() - for keypoints in all_keypoints: - for ind, skeleton in 
enumerate(cfg.vis_skeletons): - jotint1 = skeleton[0] - jotint2 = skeleton[1] + self.detector = detect_model + self.det_func = det_func - X = np.array([keypoints[jotint1, 0], keypoints[jotint2, 0]]) + self.keypoint_model = keypoint_model + self.keypoint_func = keypoint_func - Y = np.array([keypoints[jotint1, 1], keypoints[jotint2, 1]]) + def detect_persons(self, image): - mX = np.mean(X) - mY = np.mean(Y) - length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 + data, im_info = DetEvaluator.process_inputs( + image.copy(), + self.detector.cfg.test_image_short_size, + self.detector.cfg.test_image_max_size, + ) - angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) - polygon = cv2.ellipse2Poly( - (int(mX), int(mY)), (int(length / 2), 4), int(angle), 0, 360, 1 - ) + self.detector.inputs["im_info"].set_value(im_info) + self.detector.inputs["image"].set_value(data.astype(np.float32)) - cur_canvas = canvas.copy() - cv2.fillConvexPoly(cur_canvas, polygon, cfg.vis_colors[ind]) - canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) + evaluator = DetEvaluator(self.detector) + det_res = evaluator.predict(self.det_func) - return canvas + persons = [] + for d in det_res: + cls_id = int(d[5] + 1) + if cls_id == 1: + bbox = d[:4] + persons.append(bbox) + return persons + + def predict_single_person(self, image, bbox): + + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + + center_x = (bbox[0] + bbox[2]) / 2 + center_y = (bbox[1] + bbox[3]) / 2 + + extend_w = w * (1 + cfg.test_x_ext) + extend_h = h * (1 + cfg.test_y_ext) + + w_h_ratio = cfg.input_shape[1] / cfg.input_shape[0] + if extend_w / extend_h > w_h_ratio: + extend_h = extend_w / w_h_ratio + else: + extend_w = extend_h * w_h_ratio + + trans = get_affine_transform( + np.array([center_x, center_y]), + np.array([extend_h, extend_w]), + 1, + 0, + cfg.input_shape, + ) + + croped_img = cv2.warpAffine( + image, + trans, + (int(cfg.input_shape[1]), int(cfg.input_shape[0])), + flags=cv2.INTER_LINEAR, + borderValue=0, + ) + + fliped_img = croped_img[:, ::-1] + keypoint_input = np.stack([croped_img, fliped_img], 0) + keypoint_input = keypoint_input.transpose(0, 3, 1, 2) + keypoint_input = np.ascontiguousarray(keypoint_input).astype(np.float32) + + self.keypoint_model.inputs["image"].set_value(keypoint_input) + + outs = self.keypoint_func() + outs = outs.numpy() + pred = outs[0] + fliped_pred = outs[1][cfg.keypoint_flip_order][:, :, ::-1] + pred = (pred + fliped_pred) / 2 + + keypoints = find_keypoints(pred, bbox) + + return keypoints + + def predict(self, image, bboxes): + normalized_img = (image - np.array(cfg.img_mean).reshape(1, 1, 3)) / np.array( + cfg.img_std + ).reshape(1, 1, 3) + all_keypoints = [] + for bbox in bboxes: + keypoints = self.predict_single_person(normalized_img, bbox) + all_keypoints.append(keypoints) + return all_keypoints + + @staticmethod + def vis_skeletons(img, all_keypoints): + canvas = img.copy() + for keypoints in all_keypoints: + for ind, skeleton in enumerate(cfg.vis_skeletons): + jotint1 = skeleton[0] + jotint2 = skeleton[1] + + X = np.array([keypoints[jotint1, 0], keypoints[jotint2, 0]]) + + Y = np.array([keypoints[jotint1, 1], keypoints[jotint2, 1]]) + + mX = np.mean(X) + mY = np.mean(Y) + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 + + angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), (int(length / 2), 4), int(angle), 0, 360, 1 + ) + + cur_canvas = canvas.copy() + cv2.fillConvexPoly(cur_canvas, polygon, cfg.vis_colors[ind]) + canvas = 
cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) + return canvas def main(): @@ -108,78 +199,18 @@ def main(): pred = keypoint_model.predict() return pred - ori_img = cv2.imread(args.image) - data, im_info = DetEvaluator.process_inputs( - ori_img.copy(), - detector.cfg.test_image_short_size, - detector.cfg.test_image_max_size, - ) - detector.inputs["im_info"].set_value(im_info) - detector.inputs["image"].set_value(data.astype(np.float32)) + evaluator = KeypointEvaluator(detector, det_func, keypoint_model, keypoint_func) - logger.info("Detecting Humans") - evaluator = DetEvaluator(detector) - det_res = evaluator.predict(det_func) + image = cv2.imread(args.image) - normalized_img = (ori_img - np.array(cfg.IMG_MEAN).reshape(1, 1, 3)) / np.array( - cfg.IMG_STD - ).reshape(1, 1, 3) + logger.info("Detecting Humans") + person_boxes = evaluator.detect_persons(image) logger.info("Detecting Keypoints") - all_keypoints = [] - for det in det_res: - cls_id = int(det[5] + 1) - if cls_id == 1: - bbox = det[:4] - w = bbox[2] - bbox[0] - h = bbox[3] - bbox[1] - - center_x = (bbox[0] + bbox[2]) / 2 - center_y = (bbox[1] + bbox[3]) / 2 - - extend_w = w * (1 + cfg.test_x_ext) - extend_h = h * (1 + cfg.test_y_ext) - - w_h_ratio = cfg.input_shape[1] / cfg.input_shape[0] - if extend_w / extend_h > w_h_ratio: - extend_h = extend_w / w_h_ratio - else: - extend_w = extend_h * w_h_ratio - - trans = get_affine_transform( - np.array([center_x, center_y]), - np.array([extend_h, extend_w]), - 1, - 0, - cfg.input_shape, - ) - - croped_img = cv2.warpAffine( - normalized_img, - trans, - (int(cfg.input_shape[1]), int(cfg.input_shape[0])), - flags=cv2.INTER_LINEAR, - borderValue=0, - ) - - fliped_img = croped_img[:, ::-1] - keypoint_input = np.stack([croped_img, fliped_img], 0) - keypoint_input = keypoint_input.transpose(0, 3, 1, 2) - keypoint_input = np.ascontiguousarray(keypoint_input).astype(np.float32) - - keypoint_model.inputs["image"].set_value(keypoint_input) - - outs = keypoint_func() - outs = outs.numpy() - pred = outs[0] - fliped_pred = outs[1][cfg.keypoint_flip_order][:, :, ::-1] - pred = (pred + fliped_pred) / 2 - - keypoints = find_keypoints(pred, bbox) - all_keypoints.append(keypoints) + all_keypoints = evaluator.predict(image, person_boxes) logger.info("Visualizing") - canvas = vis_skeleton(ori_img, all_keypoints) + canvas = evaluator.vis_skeletons(image, all_keypoints) cv2.imwrite("vis_skeleton.jpg", canvas) diff --git a/official/vision/keypoints/models/__init__.py b/official/vision/keypoints/models/__init__.py index 53bb5936b5c59c77bfea56da9dd3efe0cee4c117..3fd84567e9f6d240d73b154059e1e5d05e97d30a 100644 --- a/official/vision/keypoints/models/__init__.py +++ b/official/vision/keypoints/models/__init__.py @@ -11,3 +11,5 @@ from .simplebaseline import ( simplebaseline_res101, simplebaseline_res152, ) + +from .mspn import mspn_4stage diff --git a/official/vision/keypoints/models/mspn.py b/official/vision/keypoints/models/mspn.py new file mode 100644 index 0000000000000000000000000000000000000000..4c9ed0fe695f859300f133f83b8d26882a02597a --- /dev/null +++ b/official/vision/keypoints/models/mspn.py @@ -0,0 +1,258 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import megengine as mge +import megengine.functional as F +import megengine.hub as hub +import megengine.module as M +import math +import official.vision.classification.resnet.model as resnet + +import numpy as np + + +class ResnetBody(M.Module): + def __init__( + self, + block, + init_channel, + layers, + channels, + zero_init_residual=False, + norm=M.BatchNorm2d, + ): + super(ResnetBody, self).__init__() + self.in_channels = init_channel + self.layer1 = self._make_layer( + block, channels[0], layers[0], stride=1, norm=norm + ) + + self.layer2 = self._make_layer( + block, channels[1], layers[1], stride=2, norm=norm + ) + + self.layer3 = self._make_layer( + block, channels[2], layers[2], stride=2, norm=norm, + ) + + self.layer4 = self._make_layer( + block, channels[3], layers[3], stride=2, norm=norm, + ) + + for m in self.modules(): + if isinstance(m, M.Conv2d): + M.init.msra_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + fan_in, _ = M.init.calculate_fan_in_and_fan_out(m.weight) + bound = 1 / math.sqrt(fan_in) + M.init.uniform_(m.bias, -bound, bound) + elif isinstance(m, M.BatchNorm2d): + M.init.ones_(m.weight) + M.init.zeros_(m.bias) + elif isinstance(m, M.Linear): + M.init.msra_uniform_(m.weight, a=math.sqrt(5)) + if m.bias is not None: + fan_in, _ = M.init.calculate_fan_in_and_fan_out(m.weight) + bound = 1 / math.sqrt(fan_in) + M.init.uniform_(m.bias, -bound, bound) + + def _make_layer(self, block, channels, blocks, stride=1, norm=M.BatchNorm2d): + layers = [] + layers.append(block(self.in_channels, channels, stride, norm=norm)) + self.in_channels = channels * block.expansion + for _ in range(1, blocks): + layers.append(block(self.in_channels, channels, norm=norm)) + + return M.Sequential(*layers) + + def forward(self, x): + outputs = [] + + x = self.layer1(x) + outputs.append(x) + x = self.layer2(x) + outputs.append(x) + x = self.layer3(x) + outputs.append(x) + x = self.layer4(x) + outputs.append(x) + + return outputs + + +class SingleStage(M.Module): + def __init__( + self, block, init_channel, layers, channels, mid_channel, norm=M.BatchNorm2d + ): + super(SingleStage, self).__init__() + self.down = ResnetBody(block, init_channel, layers, channels, norm) + channel = block.expansion * channels[-1] + self.up1 = M.Sequential( + M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel) + ) + self.deconv1 = M.Sequential( + M.ConvTranspose2d(mid_channel, mid_channel, 4, 2, 1), norm(mid_channel) + ) + + channel = block.expansion * channels[-2] + self.up2 = M.Sequential( + M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel) + ) + self.deconv2 = M.Sequential( + M.ConvTranspose2d(mid_channel, mid_channel, 4, 2, 1), norm(mid_channel) + ) + + channel = block.expansion * channels[-3] + self.up3 = M.Sequential( + M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel) + ) + self.deconv3 = M.Sequential( + M.ConvTranspose2d(mid_channel, mid_channel, 4, 2, 1), norm(mid_channel) + ) + + channel = block.expansion * channels[-4] + self.up4 = M.Sequential( + M.Conv2d(channel, mid_channel, 1, 1, 0), norm(mid_channel) + ) + + def forward(self, x): + branches = self.down(x) + branches = list(reversed(branches)) + + outputs = [] + f_up = F.relu(self.up1(branches[0])) + + outputs.append(f_up) + + f = self.up2(branches[1]) + f_up = F.relu(self.deconv1(f_up) + f) + outputs.append(f_up) + + f = self.up3(branches[2]) + f_up = F.relu(self.deconv2(f_up) + f) + outputs.append(f_up) + + f = self.up4(branches[3]) + f_up = F.relu(self.deconv3(f_up) + f) + 
outputs.append(f_up) + + return outputs + + +class MSPN(M.Module): + def __init__(self, block, layers, channels, mid_channel, keypoint_num, nr_stg): + super(MSPN, self).__init__() + + block = getattr(resnet, block) + norm = M.BatchNorm2d + + self.nr_stg = nr_stg + self.keypoint_num = keypoint_num + + self.head = M.Sequential( + M.Conv2d(3, 64, 3, 2, 1), + norm(64), + M.ReLU(), + M.Conv2d(64, 64, 3, 1, 1), + norm(64), + M.ReLU(), + M.Conv2d(64, 64, 3, 2, 1), + norm(64), + M.ReLU(), + ) + + self.stages = {} + for i in range(nr_stg): + init_channel = 64 + self.stages["Stage_{}_body".format(i)] = SingleStage( + block, init_channel, layers, channels, mid_channel, norm + ) + tail = {} + for j in range(4): + tail["tail_{}".format(j)] = M.Conv2d(mid_channel, keypoint_num, 3, 1, 1) + self.stages["Stage_{}_tail".format(i)] = tail + + if i < nr_stg - 1: + self.stages["Stage_{}_next".format(i)] = M.Sequential( + M.Conv2d(mid_channel, 64, 1, 1, 0), norm(64), M.ReLU() + ) + + self.inputs = { + "image": mge.tensor(dtype="float32"), + "heatmap": mge.tensor(dtype="float32"), + "heat_valid": mge.tensor(dtype="float32"), + } + + def calc_loss(self): + outs = self.forward(self.inputs["image"]) + + loss = 0 + for stage_out in outs: + for ind, scale_out in enumerate(stage_out[:-1]): + label = ( + self.inputs["heatmap"][:, ind] + * (self.inputs["heat_valid"] > 1.1)[:, :, None, None] + ) + tmp = F.square_loss(scale_out, label) + loss += tmp / 4 / len(outs) + + # OHKM loss for the largest heatmap + tmp = ((stage_out[-1] - self.inputs["heatmap"][:, -1]) ** 2).mean(3).mean( + 2 + ) * (self.inputs["heat_valid"] > 0.1) + ohkm_loss = 0 + for i in range(tmp.shape[0]): + selected_loss, _ = F.top_k( + tmp[i], self.keypoint_num // 2, descending=True + ) + ohkm_loss += selected_loss.mean() + ohkm_loss /= tmp.shape[0] + loss += ohkm_loss + return loss + + def predict(self): + outputs = self.forward(self.inputs["image"]) + pred = outputs[-1][-1] + return pred + + def forward(self, x): + + f = self.head(x) + outputs = [] + for i in range(self.nr_stg): + multi_scale_features = self.stages["Stage_{}_body".format(i)](f) + + multi_scale_heatmaps = [] + for j in range(4): + out = self.stages["Stage_{}_tail".format(i)]["tail_{}".format(j)]( + multi_scale_features[j] + ) + out = F.interpolate(out, scale_factor=2 ** (3 - j)) + multi_scale_heatmaps.append(out) + + if i < self.nr_stg - 1: + f = self.stages["Stage_{}_next".format(i)](multi_scale_features[-1]) + + outputs.append(multi_scale_heatmaps) + return outputs + + +@hub.pretrained( + "https://data.megengine.org.cn/models/weights/mspn_4stage_256x192_0_255_75_2.pkl" +) +def mspn_4stage(**kwargs): + model = MSPN( + block="Bottleneck", + layers=[5, 5, 6, 3], + channels=[64, 128, 192, 384], + nr_stg=4, + mid_channel=256, + keypoint_num=17, + **kwargs + ) + return model diff --git a/official/vision/keypoints/models/simplebaseline.py b/official/vision/keypoints/models/simplebaseline.py index 6b9d7dcf08f993c107a48cebbbbc5fbbd19d4620..42206988e6365dd9f861eea4b0102efc95e4ac2e 100644 --- a/official/vision/keypoints/models/simplebaseline.py +++ b/official/vision/keypoints/models/simplebaseline.py @@ -13,7 +13,6 @@ import megengine.module as M import official.vision.classification.resnet.model as resnet import numpy as np -from functools import partial class DeconvLayers(M.Module): @@ -38,10 +37,12 @@ class DeconvLayers(M.Module): class SimpleBaseline(M.Module): - def __init__(self, backbone, cfg, pretrained=False): - - norm = partial(M.BatchNorm2d, momentum=cfg.bn_momentum) - self.backbone = 
getattr(resnet, backbone)(norm=norm, pretrained=pretrained) + def __init__(self, backbone, cfg): + super(SimpleBaseline, self).__init__() + norm = M.BatchNorm2d + self.backbone = getattr(resnet, backbone)( + norm=norm, pretrained=cfg.backbone_pretrained + ) del self.backbone.fc self.cfg = cfg @@ -67,7 +68,7 @@ class SimpleBaseline(M.Module): def calc_loss(self): out = self.forward(self.inputs["image"]) valid = self.inputs["heat_valid"][:, :, None, None] - label = self.inputs["heatmap"][:, 0] + label = self.inputs["heatmap"][:, -1] loss = F.square_loss(out * valid, label * valid) return loss @@ -101,8 +102,8 @@ class SimpleBaseline_Config: deconv_channels = [256, 256, 256] deconv_kernel_sizes = [4, 4, 4] deconv_with_bias = False - bn_momentum = 0.1 keypoint_num = 17 + backbone_pretrained = True cfg = SimpleBaseline_Config() diff --git a/official/vision/keypoints/test.py b/official/vision/keypoints/test.py index 1871b8b8d398128f46992564670c63e69c946983..460ab8811d6aa8039a47a7b8507d65243660e9bb 100644 --- a/official/vision/keypoints/test.py +++ b/official/vision/keypoints/test.py @@ -34,7 +34,7 @@ logger = mge.get_logger(__name__) def build_dataloader(rank, world_size, data_root, ann_file): val_dataset = COCOJoints( - data_root, ann_file, image_set="val", order=("image", "boxes", "info") + data_root, ann_file, image_set="val2017", order=("image", "boxes", "info") ) val_sampler = SequentialSampler(val_dataset, 1, world_size=world_size, rank=rank) val_dataloader = DataLoader( @@ -43,7 +43,7 @@ def build_dataloader(rank, world_size, data_root, ann_file): num_workers=4, transform=T.Compose( transforms=[ - T.Normalize(mean=cfg.IMG_MEAN, std=cfg.IMG_STD), + T.Normalize(mean=cfg.img_mean, std=cfg.img_std), ExtendBoxes( cfg.test_x_ext, cfg.test_y_ext, @@ -213,17 +213,10 @@ def worker( def make_parser(): parser = argparse.ArgumentParser() parser.add_argument("-n", "--ngpus", default=8, type=int) - parser.add_argument("-d", "--data_root", default="/", type=str) - parser.add_argument( - "-gt", - "--gt_path", - default="/data/coco/annotations/person_keypoints_val2017.json", - type=str, - ) parser.add_argument( "-dt", - "--dt_path", - default="/data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json", + "--dt_file", + default="COCO_val2017_detections_AP_H_56_person.json", type=str, ) parser.add_argument("-se", "--start_epoch", default=-1, type=int) @@ -237,6 +230,7 @@ def make_parser(): "simplebaseline_res50", "Simplebaseline_res101", "Simplebaseline_res152", + "mspn_4stage", ], ) parser.add_argument( @@ -255,8 +249,13 @@ def main(): parser = make_parser() args = parser.parse_args() - dets = json.load(open(args.dt_path, "r")) - eval_gt = COCO(args.gt_path) + dt_path = os.path.join(cfg.data_root, "person_detection_results", args.dt_file) + dets = json.load(open(dt_path, "r")) + + gt_path = os.path.join( + cfg.data_root, "annotations", "person_keypoints_val2017.json" + ) + eval_gt = COCO(gt_path) gt = eval_gt.dataset dets = [ @@ -285,7 +284,7 @@ def main(): args=( args.arch, model_file, - args.data_root, + cfg.data_root, ann_file, i, args.ngpus, diff --git a/official/vision/keypoints/train.py b/official/vision/keypoints/train.py index 763a84a9e472e12864d177c9b82a918722cfdb48..83d8590af2641bb22d81e4f8868e0dbf0344bbb6 100644 --- a/official/vision/keypoints/train.py +++ b/official/vision/keypoints/train.py @@ -47,21 +47,12 @@ def main(): "simplebaseline_res50", "simplebaseline_res101", "simplebaseline_res152", + "mspn_4stage", ], ) - parser.add_argument("--pretrained", default=True, 
type=bool) parser.add_argument("-s", "--save", default="/data/models", type=str) - parser.add_argument("--data_root", default="/data/coco/images/", type=str) - parser.add_argument( - "--ann_file", - default="/data/coco/annotations/person_keypoints_train2017.json", - type=str, - ) - parser.add_argument("--continue", default=None, type=str) - parser.add_argument("-b", "--batch_size", default=64, type=int) - parser.add_argument("--lr", default=6e-4, type=float) - parser.add_argument("--epochs", default=200, type=int) + parser.add_argument("--resume", default=None, type=str) parser.add_argument("--multi_scale_supervision", default=True, type=bool) @@ -81,7 +72,7 @@ def main(): if world_size > 1: # scale learning rate by number of gpus - args.lr *= world_size + cfg.initial_lr *= world_size # start distributed training, dispatch sub-processes processes = [] for rank in range(world_size): @@ -110,30 +101,35 @@ def worker(rank, world_size, args): model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1]) save_dir = os.path.join(args.save, model_name) - model = getattr(M, args.arch)(pretrained=args.pretrained) + model = getattr(M, args.arch)() model.train() start_epoch = 0 - if args.c is not None: - file = mge.load(args.c) + if args.resume is not None: + file = mge.load(args.resume) model.load_state_dict(file["state_dict"]) start_epoch = file["epoch"] optimizer = optim.Adam( - model.parameters(requires_grad=True), lr=args.lr, weight_decay=cfg.weight_decay, + model.parameters(requires_grad=True), + lr=cfg.initial_lr, + weight_decay=cfg.weight_decay, ) # Build train datasets logger.info("preparing dataset..") + ann_file = os.path.join( + cfg.data_root, "annotations", "person_keypoints_train2017.json" + ) train_dataset = COCOJoints( - args.data_root, - args.ann_file, - image_set="train", + cfg.data_root, + ann_file, + image_set="train2017", order=("image", "keypoints", "boxes", "info"), ) train_sampler = data.RandomSampler( - train_dataset, batch_size=args.batch_size, drop_last=True + train_dataset, batch_size=cfg.batch_size, drop_last=True ) - transforms = [T.Normalize(mean=cfg.IMG_MEAN, std=cfg.IMG_STD)] + transforms = [T.Normalize(mean=cfg.img_mean, std=cfg.img_std)] if cfg.half_body_transform: transforms.append( HalfBodyTransform( @@ -167,14 +163,14 @@ def worker(rank, world_size, args): cfg.input_shape, cfg.output_shape, cfg.keypoint_num, - cfg.heat_thre, + cfg.heat_thr, cfg.heat_kernel if args.multi_scale_supervision else cfg.heat_kernel[-1:], cfg.heat_range, ), ) # Start training - for epoch in range(start_epoch, args.epochs): + for epoch in range(start_epoch, cfg.epochs): loss = train(model, train_queue, optimizer, args, epoch=epoch) logger.info("Epoch %d Train %.6f ", epoch, loss) @@ -208,10 +204,10 @@ def train(model, data_queue, optimizer, args, epoch=0): ) * current_step / cfg.warm_epochs / len(data_queue) else: lr_factor = 1 - (current_step - len(data_queue) * cfg.warm_epochs) / ( - len(data_queue) * (args.epochs - cfg.warm_epochs) + len(data_queue) * (cfg.epochs - cfg.warm_epochs) ) - lr = args.initial_lr * lr_factor + lr = cfg.initial_lr * lr_factor param_group["lr"] = lr lr = optimizer.param_groups[0]["lr"]
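
The last hunk above moves the training hyper-parameters from `args` into `Config`, so the schedule in `train()` now reads `cfg.initial_lr` and `cfg.epochs`: a short linear warm-up over `cfg.warm_epochs`, then a linear decay to zero over the remaining epochs (before any multi-GPU scaling of `initial_lr`). The warm-up branch is only partly visible in this diff, so the exact ramp in the sketch below is an assumption; `steps_per_epoch` stands in for `len(data_queue)`.

```python
# Rough sketch of the per-step learning-rate schedule used in train.py after this change.
# Defaults mirror Config (initial_lr=3e-4, warm_epochs=1, epochs=200); the warm-up ramp
# (0 -> initial_lr) is an assumed form since that branch is truncated in the hunk above.
def lr_at_step(current_step, steps_per_epoch, initial_lr=3e-4, warm_epochs=1, epochs=200):
    warm_steps = warm_epochs * steps_per_epoch
    total_steps = epochs * steps_per_epoch
    if current_step < warm_steps:
        # linear warm-up (assumed form)
        lr_factor = current_step / warm_steps
    else:
        # linear decay to zero, as written in the hunk above
        lr_factor = 1 - (current_step - warm_steps) / (total_steps - warm_steps)
    return initial_lr * lr_factor


steps_per_epoch = 1000  # hypothetical value; equals len(data_queue) in train.py
for step in (0, 500, 1000, 100_000, 199_999):
    print(step, lr_at_step(step, steps_per_epoch))
```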
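
`MSPN.calc_loss` in the new `mspn.py` combines a plain MSE on the intermediate multi-scale heatmaps with an online hard keypoint mining (OHKM) term on the final heatmap: per sample, only the `keypoint_num // 2` keypoints with the largest per-keypoint error contribute. A minimal numpy illustration of that OHKM term (shapes and names here are illustrative, not the module's API):

```python
# Illustration of the OHKM term in MSPN.calc_loss: keep only the hardest half of the
# keypoints (largest per-keypoint MSE on the final heatmap) for each sample.
import numpy as np


def ohkm_loss(pred, target, valid, topk):
    # pred/target: (N, K, H, W) final-scale heatmaps; valid: (N, K) visibility flags
    per_kpt = ((pred - target) ** 2).mean(axis=(2, 3)) * (valid > 0.1)  # (N, K) masked MSE
    loss = 0.0
    for i in range(per_kpt.shape[0]):
        # top-k largest errors, analogous to F.top_k(..., descending=True)
        hardest = np.sort(per_kpt[i])[::-1][:topk]
        loss += hardest.mean()
    return loss / per_kpt.shape[0]


rng = np.random.default_rng(0)
N, K, H, W = 2, 17, 64, 48
pred, target = rng.random((N, K, H, W)), rng.random((N, K, H, W))
valid = np.ones((N, K))
print(ohkm_loss(pred, target, valid, topk=K // 2))
```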
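
With `KeypointEvaluator` now exported from `hubconf.py`, the single-image pipeline that `inference.py::main()` drives (detect persons with RetinaNet, estimate keypoints per box with flip-test averaging, draw the skeletons) can be reused from a short script. The sketch below follows the calls visible in `main()` above; how the detector, the keypoint model, and the two forward callables are constructed is an assumption (hub-`pretrained` builders, plain closures instead of possibly jit-traced functions), not a verbatim copy of the repository's setup.

```python
# Hypothetical usage sketch of the KeypointEvaluator added in this change; model and
# callable construction below is assumed, see inference.py::main() for the real setup.
import cv2

import official.vision.detection.retinanet_res50_coco_1x_800size as Det
import official.vision.keypoints.models as M
from official.vision.keypoints.inference import KeypointEvaluator

# assumption: the hub.pretrained-decorated builders accept a `pretrained` keyword
detector = Det.retinanet_res50_coco_1x_800size(pretrained=True)
detector.eval()
keypoint_model = M.mspn_4stage(pretrained=True)
keypoint_model.eval()


def det_func():
    # assumption: mirrors the (possibly jit-traced) detection forward used in main()
    return detector(detector.inputs)


def keypoint_func():
    # matches the fragment visible in main(): pred = keypoint_model.predict()
    return keypoint_model.predict()


evaluator = KeypointEvaluator(detector, det_func, keypoint_model, keypoint_func)

image = cv2.imread("/path/to/image.jpg")            # hypothetical path
person_boxes = evaluator.detect_persons(image)       # class-1 (person) boxes from RetinaNet
all_keypoints = evaluator.predict(image, person_boxes)
canvas = KeypointEvaluator.vis_skeletons(image, all_keypoints)
cv2.imwrite("vis_skeleton.jpg", canvas)
```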