From b8a593e7395b7cf6caf9d4042c6c70fe48b4aa38 Mon Sep 17 00:00:00 2001
From: Zhen Wang
Date: Fri, 26 Feb 2021 14:27:59 +0800
Subject: [PATCH] Use correct master weights in AdamW. (#30895) (#31142)

* Use correct master weights in AdamW.

* Just modify the master weight.

* Update for CI Coverage.
---
 .../tests/test_multi_precision_fp16_train.py |  2 +-
 python/paddle/optimizer/adamw.py             | 13 +++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
index 15373ee7bba..b190a5d02ef 100644
--- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
+++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
@@ -97,7 +97,7 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
         test_program = train_program.clone(for_test=True)
 
         if use_adam:
-            optimizer = paddle.optimizer.Adam(
+            optimizer = paddle.optimizer.AdamW(
                 learning_rate=0.001,
                 epsilon=1e-8,
                 weight_decay=0.0,
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index cd3955d5f06..78c9fcb83fc 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -14,6 +14,7 @@
 
 from .optimizer import Optimizer
 from .adam import Adam
+from ..fluid import core
 from ..fluid import framework
 from ..fluid.dygraph import base as imperative_base
 import paddle
@@ -182,8 +183,16 @@ class AdamW(Adam):
                 decay_coeff = 1.0 - learning_rate * self._coeff
                 self._lr_to_coeff[learning_rate] = decay_coeff
 
-            scaled_param = param * decay_coeff
-            paddle.fluid.layers.assign(input=scaled_param, output=param)
+            find_master = (self._multi_precision and
+                           param.dtype == core.VarDesc.VarType.FP16)
+            if find_master:
+                master_weight = self._master_weights[param.name]
+                scaled_param = master_weight * decay_coeff
+                paddle.fluid.layers.assign(
+                    input=scaled_param, output=master_weight)
+            else:
+                scaled_param = param * decay_coeff
+                paddle.fluid.layers.assign(input=scaled_param, output=param)
 
     def _append_optimize_op(self, block, param_and_grad):
         self._append_decoupled_weight_decay(block, param_and_grad)
--
GitLab
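
Note (not part of the patch): below is a minimal sketch of how the fixed code path is exercised, loosely following the updated test. Only paddle.optimizer.AdamW and its multi_precision flag come from the diff itself; the toy network, the weight_decay value, and the static AMP helpers (paddle.static.amp.decorate, amp_init) are assumptions about this Paddle version and are not verified here.

# Assumed setup: GPU build of PaddlePaddle from around this commit, static graph mode.
import paddle

paddle.enable_static()

main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 32], dtype='float32')
    hidden = paddle.static.nn.fc(x, size=10)
    loss = paddle.mean(hidden)

    # multi_precision=True keeps FP32 master weights for FP16 parameters; with
    # this patch, the decoupled weight decay is applied to those master weights
    # instead of the FP16 copies.
    optimizer = paddle.optimizer.AdamW(
        learning_rate=0.001,
        epsilon=1e-8,
        weight_decay=0.01,
        multi_precision=True)

    # Pure-FP16 training via the static AMP decorator (assumed available here,
    # as in the test touched by this patch).
    mp_optimizer = paddle.static.amp.decorate(
        optimizer=optimizer,
        init_loss_scaling=128.0,
        use_dynamic_loss_scaling=True,
        use_pure_fp16=True)
    mp_optimizer.minimize(loss)

place = paddle.CUDAPlace(0)
exe = paddle.static.Executor(place)
exe.run(startup_prog)
# Cast FP32 parameters to FP16 and initialize the FP32 master weights.
mp_optimizer.amp_init(place, scope=paddle.static.global_scope())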