diff --git a/doc/doc_ch/config.md b/doc/doc_ch/config.md
index 2cc502cadf5101c4321ca7543647dd90ea7e0466..af5b6e51553895a9dcda9012bde729ce849c2136 100644
--- a/doc/doc_ch/config.md
+++ b/doc/doc_ch/config.md
@@ -11,7 +11,7 @@
 ## 配置文件参数介绍
 
 以 `rec_chinese_lite_train_v1.1.yml ` 为例
-### Global 
+### Global
 
 | 字段 | 用途 | 默认值 | 备注 |
 | :----------------------: | :---------------------: | :--------------: | :--------------------: |
@@ -42,6 +42,7 @@
 | name | 优化器类名 | Adam | 目前支持`Momentum`,`Adam`,`RMSProp`, 见[ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
 | beta1 | 设置一阶矩估计的指数衰减率 | 0.9 | \ |
 | beta2 | 设置二阶矩估计的指数衰减率 | 0.999 | \ |
+| clip_norm | 所允许的二范数最大值 | - | \ |
 | **lr** | 设置学习率decay方式 | - | \ |
 | name | 学习率decay类名 | Cosine | 目前支持`Linear`,`Cosine`,`Step`,`Piecewise`, 见[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 | learning_rate | 基础学习率 | 0.001 | \ |
@@ -119,4 +120,4 @@
 | shuffle | 每个epoch是否将数据集顺序打乱 | True | \ |
 | batch_size_per_card | 训练时单卡batch size | 256 | \ |
 | drop_last | 是否丢弃因数据集样本数不能被 batch_size 整除而产生的最后一个不完整的mini-batch | True | \ |
-| num_workers | 用于加载数据的子进程个数,若为0即为不开启子进程,在主进程中进行数据加载 | 8 | \ |
\ No newline at end of file
+| num_workers | 用于加载数据的子进程个数,若为0即为不开启子进程,在主进程中进行数据加载 | 8 | \ |
diff --git a/doc/doc_en/config_en.md b/doc/doc_en/config_en.md
index 574bb41b6b1735271f9c794b856c5efb32db424f..b8f638a6ec19f9803397e4689947c03c81daffe8 100644
--- a/doc/doc_en/config_en.md
+++ b/doc/doc_en/config_en.md
@@ -10,7 +10,7 @@ The following list can be viewed through `--help`
 ## INTRODUCTION TO GLOBAL PARAMETERS OF CONFIGURATION FILE
 
 Take rec_chinese_lite_train_v1.1.yml as an example
-### Global 
+### Global
 
 | Parameter | Use | Defaults | Note |
 | :----------------------: | :---------------------: | :--------------: | :--------------------: |
@@ -41,6 +41,7 @@ Take rec_chinese_lite_train_v1.1.yml as an example
 | name | Optimizer class name | Adam | Currently supports`Momentum`,`Adam`,`RMSProp`, see [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) |
 | beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ |
 | beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ |
+| clip_norm | The maximum L2 norm allowed for gradient clipping | - | \ |
 | **lr** | Set the learning rate decay method | - | \ |
 | name | Learning rate decay class name | Cosine | Currently supports`Linear`,`Cosine`,`Step`,`Piecewise`, see[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) |
 | learning_rate | Set the base learning rate | 0.001 | \ |
@@ -118,4 +119,4 @@ In ppocr, the network is divided into four stages: Transform, Backbone, Neck and
 | shuffle | Does each epoch disrupt the order of the data set | True | \ |
 | batch_size_per_card | Single card batch size during training | 256 | \ |
 | drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ |
-| num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ |
\ No newline at end of file
+| num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ |
diff --git a/ppocr/optimizer/__init__.py b/ppocr/optimizer/__init__.py
index 6413ae959200c25d6c17b1ec93217c0e8b0bf269..c729103a700a59764bda4f53dd68d3958172ca57 100644
--- a/ppocr/optimizer/__init__.py
+++ b/ppocr/optimizer/__init__.py
@@ -16,8 +16,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-
 import copy
+import paddle
 
 __all__ = ['build_optimizer']
 
@@ -49,7 +49,13 @@ def build_optimizer(config, epochs, step_each_epoch, parameters):
 
     # step3 build optimizer
     optim_name = config.pop('name')
+    if 'clip_norm' in config:
+        clip_norm = config.pop('clip_norm')
+        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
+    else:
+        grad_clip = None
     optim = getattr(optimizer, optim_name)(learning_rate=lr,
                                            weight_decay=reg,
+                                           grad_clip=grad_clip,
                                            **config)
     return optim(parameters), lr
diff --git a/ppocr/optimizer/optimizer.py b/ppocr/optimizer/optimizer.py
index 2519e4e309f651dbbaebecfe8533c3eb393d47cb..8215b92d8c8d05c2b3c2e95ac989bf4ea011310b 100644
--- a/ppocr/optimizer/optimizer.py
+++ b/ppocr/optimizer/optimizer.py
@@ -30,18 +30,25 @@ class Momentum(object):
         regularization (WeightDecayRegularizer, optional) - The strategy of regularization.
     """
 
-    def __init__(self, learning_rate, momentum, weight_decay=None, **args):
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 weight_decay=None,
+                 grad_clip=None,
+                 **args):
         super(Momentum, self).__init__()
         self.learning_rate = learning_rate
         self.momentum = momentum
         self.weight_decay = weight_decay
+        self.grad_clip = grad_clip
 
     def __call__(self, parameters):
         opt = optim.Momentum(
             learning_rate=self.learning_rate,
             momentum=self.momentum,
-            parameters=parameters,
-            weight_decay=self.weight_decay)
+            weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
+            parameters=parameters)
         return opt
 
 
@@ -96,10 +103,11 @@ class RMSProp(object):
 
     def __init__(self,
                  learning_rate,
-                 momentum,
+                 momentum=0.0,
                  rho=0.95,
                  epsilon=1e-6,
                  weight_decay=None,
+                 grad_clip=None,
                  **args):
         super(RMSProp, self).__init__()
         self.learning_rate = learning_rate
@@ -107,6 +115,7 @@ class RMSProp(object):
         self.rho = rho
         self.epsilon = epsilon
         self.weight_decay = weight_decay
+        self.grad_clip = grad_clip
 
     def __call__(self, parameters):
         opt = optim.RMSProp(
@@ -115,5 +124,6 @@ class RMSProp(object):
             rho=self.rho,
             epsilon=self.epsilon,
             weight_decay=self.weight_decay,
+            grad_clip=self.grad_clip,
             parameters=parameters)
         return opt
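
To make the behaviour of the new `clip_norm` option concrete, here is a minimal standalone sketch of how a value from the `Optimizer` section of a config would flow through the branch added in `ppocr/optimizer/__init__.py`. The sample config dict, the threshold of `10.0` and the one-layer stand-in model are illustrative assumptions, not part of the patch; only `paddle.nn.ClipGradByNorm` and the `grad_clip` argument forwarded to the Paddle optimizers come from the change itself.

```python
import paddle

# Hypothetical `Optimizer` section of a YAML config, flattened to a dict.
# The clip_norm value of 10.0 is an arbitrary example threshold.
optim_config = {'beta1': 0.9, 'beta2': 0.999, 'clip_norm': 10.0}

# Mirrors the branch added to build_optimizer(): pop `clip_norm` when present
# and wrap it in ClipGradByNorm, otherwise leave gradient clipping disabled.
if 'clip_norm' in optim_config:
    grad_clip = paddle.nn.ClipGradByNorm(clip_norm=optim_config.pop('clip_norm'))
else:
    grad_clip = None

model = paddle.nn.Linear(10, 1)  # stand-in for a real OCR model
opt = paddle.optimizer.Adam(
    learning_rate=0.001,
    beta1=optim_config['beta1'],
    beta2=optim_config['beta2'],
    grad_clip=grad_clip,  # each gradient's L2 norm is capped at clip_norm
    parameters=model.parameters())
```

Note that `ClipGradByNorm` clips every gradient tensor independently against the threshold; if clipping by the global norm of all gradients were preferred, a `paddle.nn.ClipGradByGlobalNorm` instance could be passed the same way, since the optimizer wrappers simply forward whatever object they receive through `grad_clip`.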