Unverified commit 358ca21d authored by Bai Yifan, committed by GitHub

Add deep mutual learning (#302)

Parent e9c8d9bf
# Deep Mutual Learning (DML)
This example shows how to train models with PaddleSlim's Deep Mutual Learning (DML) method. For the algorithm itself, please refer to the paper [Deep Mutual Learning](https://arxiv.org/abs/1706.00384).
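For reference, with a cohort of $K$ peer models, each model $i$ is optimized with its own cross-entropy loss plus the average KL divergence from every other peer's predicted distribution $p_j$ to its own distribution $p_i$; this is the loss implemented in `dml.py` below:

$$L_i = L_{CE}(y, p_i) + \frac{1}{K - 1}\sum_{j \neq i} D_{KL}(p_j \,\|\, p_i)$$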
## Dataset
This example trains on the CIFAR-100 dataset. You can either let it be downloaded automatically when training starts,
or download the [dataset](https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz) yourself and place it under `./dataset/cifar100` in the current directory.
## Launch commands
Single-GPU training, using GPU 0 as an example:
```bash
CUDA_VISIBLE_DEVICES=0 python dml_train.py
```
Multi-GPU training, using GPUs 0-3 as an example:
```bash
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog dml_train.py --use_parallel=True
```
## Results
The following results can be reproduced with the default training configuration (learning rate, optimizer, etc.); only the combination of models trained jointly with DML was varied.
To improve the results further, you can try [more training tricks](https://arxiv.org/abs/1812.01187) or increase the number of models trained together in one DML run.
| Dataset | Models | Accuracy (trained independently) | Accuracy (with DML) |
| ------ | ------ | ------ | ------ |
| CIFAR100 | MobileNet X 2 | 73.65% | 76.34% (+2.69%) |
| CIFAR100 | MobileNet X 4 | 73.65% | 76.56% (+2.91%) |
| CIFAR100 | MobileNet + ResNet50 | 73.65%/76.52% | 76.00%/77.80% (+2.35%/+1.28%) |
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from PIL import Image, ImageOps
import os
import math
import random
import tarfile
import functools
import numpy as np
import paddle
# for python2/python3 compatibility
try:
    import cPickle
except ImportError:
    import _pickle as cPickle
IMAGE_SIZE = 32
IMAGE_DEPTH = 3
CIFAR_MEAN = [0.5070751592371323, 0.48654887331495095, 0.4409178433670343]
CIFAR_STD = [0.2673342858792401, 0.2564384629170883, 0.27615047132568404]
URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
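# download datasets into ./dataset/ instead of paddle's default cache directory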
paddle.dataset.common.DATA_HOME = "dataset/"
def preprocess(sample, is_training):
image_array = sample.reshape(IMAGE_DEPTH, IMAGE_SIZE, IMAGE_SIZE)
rgb_array = np.transpose(image_array, (1, 2, 0))
img = Image.fromarray(rgb_array, 'RGB')
if is_training:
        # pad, random crop, random horizontal flip, random rotation
img = ImageOps.expand(img, (4, 4, 4, 4), fill=0)
left_top = np.random.randint(8, size=2)
img = img.crop((left_top[1], left_top[0], left_top[1] + IMAGE_SIZE,
left_top[0] + IMAGE_SIZE))
if np.random.randint(2):
img = img.transpose(Image.FLIP_LEFT_RIGHT)
random_angle = np.random.randint(-15, 15)
img = img.rotate(random_angle, Image.NEAREST)
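    # scale to [0, 1], normalize with the per-channel CIFAR statistics, and convert HWC -> CHW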
img = np.array(img).astype(np.float32)
img_float = img / 255.0
img = (img_float - CIFAR_MEAN) / CIFAR_STD
img = np.transpose(img, (2, 0, 1))
return img
def reader_generator(datasets, batch_size, is_training, is_shuffle):
def read_batch(datasets):
if is_shuffle:
random.shuffle(datasets)
for im, label in datasets:
im = preprocess(im, is_training)
yield im, [int(label)]
def reader():
batch_data = []
batch_label = []
for data in read_batch(datasets):
batch_data.append(data[0])
batch_label.append(data[1])
if len(batch_data) == batch_size:
batch_data = np.array(batch_data, dtype='float32')
batch_label = np.array(batch_label, dtype='int64')
batch_out = [batch_data, batch_label]
yield batch_out
batch_data = []
batch_label = []
return reader
def cifar100_reader(file_name, data_name, is_shuffle):
with tarfile.open(file_name, mode='r') as f:
names = [
each_item.name for each_item in f if data_name in each_item.name
]
names.sort()
datasets = []
for name in names:
print("Reading file " + name)
            try:
                batch = cPickle.load(
                    f.extractfile(name), encoding='iso-8859-1')
            except TypeError:
                # python2's cPickle.load does not accept the encoding argument
                batch = cPickle.load(f.extractfile(name))
data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None))
assert labels is not None
dataset = zip(data, labels)
datasets.extend(dataset)
if is_shuffle:
random.shuffle(datasets)
return datasets
def train_valid(batch_size, is_train, is_shuffle):
name = 'train' if is_train else 'test'
datasets = cifar100_reader(
paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
name, is_shuffle)
reader = reader_generator(datasets, batch_size, is_train, is_shuffle)
return reader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import functools
import logging
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddleslim.common import AvgrageMeter, get_logger
from paddleslim.dist import DML
from paddleslim.models.dygraph import MobileNetV1
import cifar100_reader as reader
sys.path[0] = os.path.join(os.path.dirname(__file__), os.path.pardir)
from utility import add_arguments, print_arguments
logger = get_logger(__name__, level=logging.INFO)
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('log_freq', int, 100, "Log frequency.")
add_arg('batch_size', int, 256, "Minibatch size.")
add_arg('init_lr', float, 0.1, "The start learning rate.")
add_arg('use_gpu', bool, True, "Whether to use GPU.")
add_arg('epochs', int, 200, "Epoch number.")
add_arg('class_num', int, 100, "Class number of dataset.")
add_arg('trainset_num', int, 50000, "Images number of trainset.")
add_arg('model_save_dir', str, 'saved_models', "The path to save model.")
add_arg('use_multiprocess', bool, True, "Whether to use a multiprocess reader.")
add_arg('use_parallel', bool, False, "Whether to use data parallel mode to train the model.")
# yapf: enable
def create_optimizer(models, args):
device_num = fluid.dygraph.parallel.Env().nranks
step = int(args.trainset_num / (args.batch_size * device_num))
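    # drop the learning rate by 10x at epochs 60, 120 and 180 (PiecewiseDecay boundaries are in steps)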
epochs = [60, 120, 180]
bd = [step * e for e in epochs]
lr = [args.init_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizers = []
for cur_model in models:
learning_rate = fluid.dygraph.PiecewiseDecay(bd, lr, 0)
opt = fluid.optimizer.MomentumOptimizer(
learning_rate,
0.9,
parameter_list=cur_model.parameters(),
use_nesterov=True,
regularization=fluid.regularizer.L2DecayRegularizer(5e-4))
optimizers.append(opt)
return optimizers
def create_reader(place, args):
train_reader = reader.train_valid(
batch_size=args.batch_size, is_train=True, is_shuffle=True)
valid_reader = reader.train_valid(
batch_size=args.batch_size, is_train=False, is_shuffle=False)
if args.use_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
train_loader = fluid.io.DataLoader.from_generator(
capacity=1024,
return_list=True,
use_multiprocess=args.use_multiprocess)
valid_loader = fluid.io.DataLoader.from_generator(
capacity=1024,
return_list=True,
use_multiprocess=args.use_multiprocess)
train_loader.set_batch_generator(train_reader, places=place)
valid_loader.set_batch_generator(valid_reader, places=place)
return train_loader, valid_loader
def train(train_loader, dml_model, dml_optimizer, args):
dml_model.train()
costs = [AvgrageMeter() for i in range(dml_model.model_num)]
accs = [AvgrageMeter() for i in range(dml_model.model_num)]
for step_id, (images, labels) in enumerate(train_loader):
images, labels = to_variable(images), to_variable(labels)
batch_size = images.shape[0]
logits = dml_model.forward(images)
precs = [
fluid.layers.accuracy(
input=l, label=labels, k=1) for l in logits
]
losses = dml_model.loss(logits, labels)
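        # each model's loss is its cross-entropy plus the averaged KL divergence
        # to the other models' predictions (see DML.loss in dml.py)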
dml_optimizer.minimize(losses)
for i in range(dml_model.model_num):
accs[i].update(precs[i].numpy(), batch_size)
costs[i].update(losses[i].numpy(), batch_size)
model_names = dml_model.full_name()
if step_id % args.log_freq == 0:
log_msg = "Train Step {}".format(step_id)
for model_id, (cost, acc) in enumerate(zip(costs, accs)):
log_msg += ", {} loss: {:.6f} acc: {:.6f}".format(
model_names[model_id], cost.avg[0], acc.avg[0])
logger.info(log_msg)
return costs, accs
def valid(valid_loader, dml_model, args):
dml_model.eval()
costs = [AvgrageMeter() for i in range(dml_model.model_num)]
accs = [AvgrageMeter() for i in range(dml_model.model_num)]
for step_id, (images, labels) in enumerate(valid_loader):
images, labels = to_variable(images), to_variable(labels)
batch_size = images.shape[0]
logits = dml_model.forward(images)
precs = [
fluid.layers.accuracy(
input=l, label=labels, k=1) for l in logits
]
losses = dml_model.loss(logits, labels)
for i in range(dml_model.model_num):
accs[i].update(precs[i].numpy(), batch_size)
costs[i].update(losses[i].numpy(), batch_size)
model_names = dml_model.full_name()
if step_id % args.log_freq == 0:
log_msg = "Valid Step{} ".format(step_id)
for model_id, (cost, acc) in enumerate(zip(costs, accs)):
log_msg += ", {} loss: {:.6f} acc: {:.6f}".format(
model_names[model_id], cost.avg[0], acc.avg[0])
logger.info(log_msg)
return costs, accs
def main(args):
if not args.use_gpu:
place = fluid.CPUPlace()
elif not args.use_parallel:
place = fluid.CUDAPlace(0)
else:
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
with fluid.dygraph.guard(place):
# 1. Define data reader
train_loader, valid_loader = create_reader(place, args)
# 2. Define neural network
models = [
MobileNetV1(class_dim=args.class_num),
MobileNetV1(class_dim=args.class_num)
]
optimizers = create_optimizer(models, args)
# 3. Use PaddleSlim DML strategy
dml_model = DML(models, args.use_parallel)
dml_optimizer = dml_model.opt(optimizers)
# 4. Train your network
save_parameters = (not args.use_parallel) or (
args.use_parallel and fluid.dygraph.parallel.Env().local_rank == 0)
best_valid_acc = [0] * dml_model.model_num
for epoch_id in range(args.epochs):
current_step_lr = dml_optimizer.get_lr()
lr_msg = "Epoch {}".format(epoch_id)
for model_id, lr in enumerate(current_step_lr):
lr_msg += ", {} lr: {:.6f}".format(
dml_model.full_name()[model_id], lr)
logger.info(lr_msg)
train_losses, train_accs = train(train_loader, dml_model,
dml_optimizer, args)
valid_losses, valid_accs = valid(valid_loader, dml_model, args)
for i in range(dml_model.model_num):
if valid_accs[i].avg[0] > best_valid_acc[i]:
best_valid_acc[i] = valid_accs[i].avg[0]
if save_parameters:
fluid.save_dygraph(
models[i].state_dict(),
os.path.join(args.model_save_dir,
dml_model.full_name()[i],
"best_model"))
                summary_msg = "Epoch {} {}: valid_loss {:.6f}, valid_acc {:.6f}, best_valid_acc {:.6f}"
                logger.info(
                    summary_msg.format(epoch_id,
                                       dml_model.full_name()[i],
                                       valid_losses[i].avg[0],
                                       valid_accs[i].avg[0],
                                       best_valid_acc[i]))
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
main(args)
@@ -13,3 +13,4 @@
# limitations under the License.
from .single_distiller import merge, fsp_loss, l2_loss, soft_label_loss, loss
from .dml import DML
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import paddle.fluid as fluid
import paddle.nn.functional as F
class DML(fluid.dygraph.Layer):
def __init__(self, model, use_parallel):
super(DML, self).__init__()
self.model = model
self.use_parallel = use_parallel
self.model_num = len(self.model)
if self.use_parallel:
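            # wrap each peer model with DataParallel; its gradients are scaled and
            # all-reduced in DMLOptimizers.minimize via scale_loss/apply_collective_grads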
strategy = fluid.dygraph.parallel.prepare_context()
self.model = [
fluid.dygraph.parallel.DataParallel(m, strategy)
for m in self.model
]
def full_name(self):
return [m.full_name() for m in self.model]
def forward(self, input):
return [m(input) for m in self.model]
def opt(self, optimizer):
assert len(
optimizer
) == self.model_num, "The number of optimizers must match the number of models"
optimizer = DMLOptimizers(self.model, optimizer, self.use_parallel)
return optimizer
def ce_loss(self, logits, labels):
assert len(
logits
) == self.model_num, "The number of logits must match the number of models"
ce_losses = []
for i in range(self.model_num):
ce_losses.append(
fluid.layers.mean(
fluid.layers.softmax_with_cross_entropy(logits[i],
labels)))
return ce_losses
def kl_loss(self, logits):
assert len(
logits
) == self.model_num, "The number of logits must match the number of models"
if self.model_num == 1:
return []
kl_losses = []
for i in range(self.model_num):
cur_kl_loss = 0
for j in range(self.model_num):
if i != j:
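                    # fluid.layers.kldiv_loss(x, target) computes target * (log(target) - x),
                    # i.e. the KL divergence from peer j's distribution to model i's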
x = F.log_softmax(logits[i], axis=1)
y = fluid.layers.softmax(logits[j], axis=1)
cur_kl_loss += fluid.layers.kldiv_loss(
x, y, reduction='batchmean')
kl_losses.append(cur_kl_loss / (self.model_num - 1))
return kl_losses
def loss(self, logits, labels):
gt_losses = self.ce_loss(logits, labels)
kl_losses = self.kl_loss(logits)
if self.model_num > 1:
return [a + b for a, b in zip(gt_losses, kl_losses)]
else:
return gt_losses
def acc(self, logits, labels, k):
accs = [
fluid.layers.accuracy(
input=l, label=labels, k=k) for l in logits
]
return accs
def train(self):
for m in self.model:
m.train()
def eval(self):
for m in self.model:
m.eval()
class DMLOptimizers(object):
def __init__(self, model, optimizer, use_parallel):
self.model = model
self.optimizer = optimizer
self.use_parallel = use_parallel
def minimize(self, losses):
assert len(losses) == len(
self.optimizer
), "The number of losses must match the number of optimizers"
for i in range(len(losses)):
if self.use_parallel:
losses[i] = self.model[i].scale_loss(losses[i])
losses[i].backward()
self.model[i].apply_collective_grads()
else:
losses[i].backward()
self.optimizer[i].minimize(losses[i])
self.model[i].clear_gradients()
def get_lr(self):
current_step_lr = [opt.current_step_lr() for opt in self.optimizer]
return current_step_lr
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from .mobilenet import MobileNetV1
from .resnet import ResNet
__all__ = ["MobileNetV1", "ResNet"]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# order: standard library, third party, local library
import os
import time
import sys
import math
import numpy as np
import argparse
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
num_channels,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
act='relu',
use_cudnn=True,
name=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=self.full_name() + "_weights"),
bias_attr=False)
self._batch_norm = BatchNorm(
num_filters,
act=act,
param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
moving_mean_name=self.full_name() + "_bn" + '_mean',
moving_variance_name=self.full_name() + "_bn" + '_variance')
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
return y
class DepthwiseSeparable(fluid.dygraph.Layer):
def __init__(self,
num_channels,
num_filters1,
num_filters2,
num_groups,
stride,
scale,
name=None):
super(DepthwiseSeparable, self).__init__()
self._depthwise_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=int(num_filters1 * scale),
filter_size=3,
stride=stride,
padding=1,
num_groups=int(num_groups * scale),
use_cudnn=False)
self._pointwise_conv = ConvBNLayer(
num_channels=int(num_filters1 * scale),
filter_size=1,
num_filters=int(num_filters2 * scale),
stride=1,
padding=0)
def forward(self, inputs):
y = self._depthwise_conv(inputs)
y = self._pointwise_conv(y)
return y
class MobileNetV1(fluid.dygraph.Layer):
def __init__(self, scale=1.0, class_dim=100):
super(MobileNetV1, self).__init__()
self.scale = scale
self.dwsl = []
self.conv1 = ConvBNLayer(
num_channels=3,
filter_size=3,
channels=3,
num_filters=int(32 * scale),
stride=1,
padding=1)
dws21 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(32 * scale),
num_filters1=32,
num_filters2=64,
num_groups=32,
stride=1,
scale=scale),
name="conv2_1")
self.dwsl.append(dws21)
dws22 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(64 * scale),
num_filters1=64,
num_filters2=128,
num_groups=64,
stride=1,
scale=scale),
name="conv2_2")
self.dwsl.append(dws22)
dws31 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(128 * scale),
num_filters1=128,
num_filters2=128,
num_groups=128,
stride=1,
scale=scale),
name="conv3_1")
self.dwsl.append(dws31)
dws32 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(128 * scale),
num_filters1=128,
num_filters2=256,
num_groups=128,
stride=2,
scale=scale),
name="conv3_2")
self.dwsl.append(dws32)
dws41 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(256 * scale),
num_filters1=256,
num_filters2=256,
num_groups=256,
stride=1,
scale=scale),
name="conv4_1")
self.dwsl.append(dws41)
dws42 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(256 * scale),
num_filters1=256,
num_filters2=512,
num_groups=256,
stride=2,
scale=scale),
name="conv4_2")
self.dwsl.append(dws42)
for i in range(5):
tmp = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(512 * scale),
num_filters1=512,
num_filters2=512,
num_groups=512,
stride=1,
scale=scale),
name="conv5_" + str(i + 1))
self.dwsl.append(tmp)
dws56 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(512 * scale),
num_filters1=512,
num_filters2=1024,
num_groups=512,
stride=2,
scale=scale),
name="conv5_6")
self.dwsl.append(dws56)
dws6 = self.add_sublayer(
sublayer=DepthwiseSeparable(
num_channels=int(1024 * scale),
num_filters1=1024,
num_filters2=1024,
num_groups=1024,
stride=1,
scale=scale),
name="conv6")
self.dwsl.append(dws6)
self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
self.out = Linear(
int(1024 * scale),
class_dim,
param_attr=ParamAttr(
initializer=MSRA(), name=self.full_name() + "fc7_weights"),
bias_attr=ParamAttr(name=self.full_name() + "fc7_offset"))
def forward(self, inputs):
y = self.conv1(inputs)
for dws in self.dwsl:
y = dws(y)
y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, 1024])
y = self.out(y)
return y
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
self._batch_norm = BatchNorm(num_filters, act=act)
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
return y
class BottleneckBlock(fluid.dygraph.Layer):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='relu')
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
act=None)
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
stride=stride)
self.shortcut = shortcut
self._num_channels_out = num_filters * 4
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
conv2 = self.conv2(conv1)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
y = fluid.layers.elementwise_add(x=short, y=conv2)
layer_helper = LayerHelper(self.full_name(), act='relu')
return layer_helper.append_activation(y)
class ResNet(fluid.dygraph.Layer):
def __init__(self, layers=50, class_dim=100):
super(ResNet, self).__init__()
self.layers = layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_channels = [64, 256, 512, 1024]
num_filters = [64, 128, 256, 512]
self.conv = ConvBNLayer(
num_channels=3,
num_filters=64,
filter_size=7,
stride=1,
act='relu')
self.pool2d_max = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.bottleneck_block_list = []
for block in range(len(depth)):
shortcut = False
for i in range(depth[block]):
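                # the first block of each stage uses a projection shortcut, and
                # downsamples (stride 2) in every stage except the first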
bottleneck_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BottleneckBlock(
num_channels=num_channels[block]
if i == 0 else num_filters[block] * 4,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut))
self.bottleneck_block_list.append(bottleneck_block)
shortcut = True
self.pool2d_avg = Pool2D(
pool_size=7, pool_type='avg', global_pooling=True)
        # global average pooling leaves a 1 x 1 spatial map with num_filters[-1] * 4 channels
        self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1
        stdv = 1.0 / math.sqrt(self.pool2d_avg_output * 1.0)
self.out = Linear(
self.pool2d_avg_output,
class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
y = self.conv(inputs)
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y)
return y