add grad clip

ccd7c40b · WenmuZhou · 913e11cb · ccd7c40b · ccd7c40b · ccd7c40b
4 changed files
--- a/doc/doc_ch/config.md
+++ b/doc/doc_ch/config.md
@@ -11,7 +11,7 @@
 ## 配置文件参数介绍

 以 `rec_chinese_lite_train_v1.1.yml ` 为例
-### Global 
+### Global

 |         字段             |            用途                |      默认值       |            备注            |
 | :----------------------: |  :---------------------:   | :--------------:  |   :--------------------:   |
@@ -42,6 +42,7 @@
 |      name        |         优化器类名          |  Adam  |  目前支持`Momentum`,`Adam`,`RMSProp`, 见[ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py)  |
 |      beta1           |    设置一阶矩估计的指数衰减率  |       0.9         |               \             |
 |      beta2           |    设置二阶矩估计的指数衰减率  |     0.999         |               \             |
+|      clip_norm           |    所允许的二范数最大值  |       -       |               \             |
 |      **lr**                |         设置学习率decay方式       |   -    |       \  |
 |        name    |      学习率decay类名   |         Cosine       | 目前支持`Linear`,`Cosine`,`Step`,`Piecewise`, 见[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 |        learning_rate      |    基础学习率        |       0.001      |  \        |
@@ -119,4 +120,4 @@
 |      shuffle        |        每个epoch是否将数据集顺序打乱         |  True | \  |
 |      batch_size_per_card        |        训练时单卡batch size         |  256 | \  |
 |      drop_last        |        是否丢弃因数据集样本数不能被 batch_size 整除而产生的最后一个不完整的mini-batch        |  True | \  |
-|      num_workers        |        用于加载数据的子进程个数，若为0即为不开启子进程，在主进程中进行数据加载        |  8 | \  |
\ No newline at end of file
+|      num_workers        |        用于加载数据的子进程个数，若为0即为不开启子进程，在主进程中进行数据加载        |  8 | \  |
--- a/doc/doc_en/config_en.md
+++ b/doc/doc_en/config_en.md
@@ -10,7 +10,7 @@ The following list can be viewed through `--help`
 ## INTRODUCTION TO GLOBAL PARAMETERS OF CONFIGURATION FILE

 Take rec_chinese_lite_train_v1.1.yml as an example
-### Global 
+### Global

 |         Parameter             |            Use                |      Defaults       |            Note            |
 | :----------------------: |  :---------------------:   | :--------------:  |   :--------------------:   |
@@ -41,6 +41,7 @@ Take rec_chinese_lite_train_v1.1.yml as an example
 |      name        |         Optimizer class name          |  Adam  |  Currently supports`Momentum`,`Adam`,`RMSProp`, see [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py)  |
 |      beta1           |    Set the exponential decay rate for the 1st moment estimates  |       0.9         |               \             |
 |      beta2           |    Set the exponential decay rate for the 2nd moment estimates  |     0.999         |               \             |
+|      clip_norm           |    The maximum L2 norm allowed for gradients (used for gradient clipping)  |    -         |               \             |
 |      **lr**                |         Set the learning rate decay method       |   -    |       \  |
 |        name    |      Learning rate decay class name   |         Cosine       | Currently supports`Linear`,`Cosine`,`Step`,`Piecewise`, see[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 |        learning_rate      |    Set the base learning rate        |       0.001      |  \        |
@@ -118,4 +119,4 @@ In ppocr, the network is divided into four stages: Transform, Backbone, Neck and
 |      shuffle        |        Does each epoch disrupt the order of the data set         |  True | \  |
 |      batch_size_per_card        |        Single card batch size during training         |  256 | \  |
 |      drop_last        |        Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size        |  True | \  |
-|      num_workers        |        The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process       |  8 | \  |
\ No newline at end of file
+|      num_workers        |        The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process       |  8 | \  |
--- a/ppocr/optimizer/__init__.py
+++ b/ppocr/optimizer/__init__.py
@@ -16,8 +16,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-
 import copy
+import paddle

 __all__ = ['build_optimizer']

@@ -49,7 +49,13 @@ def build_optimizer(config, epochs, step_each_epoch, parameters):

    # step3 build optimizer
    optim_name = config.pop('name')
+    if 'clip_norm' in config:
+        clip_norm = config.pop('clip_norm')
+        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
+    else:
+        grad_clip = None
    optim = getattr(optimizer, optim_name)(learning_rate=lr,
                                           weight_decay=reg,
+                                           grad_clip=grad_clip,
                                           **config)
    return optim(parameters), lr
--- a/ppocr/optimizer/optimizer.py
+++ b/ppocr/optimizer/optimizer.py
@@ -30,18 +30,25 @@ class Momentum(object):
        regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
    """

-    def __init__(self, learning_rate, momentum, weight_decay=None, **args):
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 weight_decay=None,
+                 grad_clip=None,
+                 **args):
        super(Momentum, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
+        self.grad_clip = grad_clip

    def __call__(self, parameters):
        opt = optim.Momentum(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
-            parameters=parameters,
-            weight_decay=self.weight_decay)
+            weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
+            parameters=parameters)
        return opt


@@ -96,10 +103,11 @@ class RMSProp(object):

    def __init__(self,
                 learning_rate,
-                 momentum,
+                 momentum=0.0,
                 rho=0.95,
                 epsilon=1e-6,
                 weight_decay=None,
+                 grad_clip=None,
                 **args):
        super(RMSProp, self).__init__()
        self.learning_rate = learning_rate
@@ -107,6 +115,7 @@ class RMSProp(object):
        self.rho = rho
        self.epsilon = epsilon
        self.weight_decay = weight_decay
+        self.grad_clip = grad_clip

    def __call__(self, parameters):
        opt = optim.RMSProp(
@@ -115,5 +124,6 @@ class RMSProp(object):
            rho=self.rho,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
            parameters=parameters)
        return opt