Unverified · Commit 6ab0a6a8 authored by W WangXi, committed by GitHub

[hybrid parallel] pipeline support adamw and LRScheduler (#34402)

Parent ede001f9
......@@ -1664,6 +1664,16 @@ class Executor(object):
print_period, fetch_handler,
use_program_cache)
from paddle.optimizer.lr import LRScheduler
if hasattr(program, 'lr_sheduler'):
lr_sheduler = program.lr_sheduler
assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
lr_value = lr_sheduler()
lr_var = program.global_block().vars[lr_sheduler._var_name]
data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
tensor = core.get_variable_tensor(scope, lr_sheduler._var_name)
tensor.set(data, self.place)
self._default_executor.run_from_dataset(trainer_instance)
if not use_program_cache:
......
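The Executor change above makes pipeline `run_from_dataset` runs aware of an LRScheduler attached to the program: if `program.lr_sheduler` is present, the executor evaluates the scheduler and writes the current value into the learning-rate variable in the scope before launching the trainer. A minimal sketch of the scheduler interface this relies on (calling the scheduler returns the current rate, `step()` advances it); the boundaries and values below are illustrative only:

```python
from paddle.optimizer.lr import LRScheduler, PiecewiseDecay

# Illustrative schedule: values must have one more entry than boundaries.
scheduler = PiecewiseDecay(boundaries=[100, 200], values=[0.1, 0.01, 0.001])
assert isinstance(scheduler, LRScheduler)

print(scheduler())   # current learning rate, 0.1 at step 0
scheduler.step()     # advance the schedule by one step
print(scheduler())   # still 0.1 until a boundary is crossed
```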
......@@ -4634,6 +4634,9 @@ class PipelineOptimizer(object):
op.type == 'elementwise_div'):
device = f"{self._device}:all"
op._set_attr(self._op_device_key, device)
elif self._is_weight_decay_op(op) and op.type == 'scale':
# set AdamW decay_coeff to device:all
op._set_attr(self._op_device_key, f"{self._device}:all")
elif op.type == "alloc_float_status":
op._set_attr(self._op_device_key, f"{self._device}:all")
else:
......@@ -5267,6 +5270,11 @@ class PipelineOptimizer(object):
return op.desc.has_attr("op_namescope") \
and op.desc.attr("op_namescope").startswith("/regularization")
def _is_weight_decay_op(self, op):
# in AdamW namescope is /optimizer_*/weight decay/
return op.desc.has_attr("op_namescope") \
and 'weight decay' in op.desc.attr("op_namescope")
def _get_input_output_info(self, block):
'''
Get info of op input and output.
......
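The PipelineOptimizer change replicates AdamW's weight-decay coefficient computation across stages: any `scale` op whose `op_namescope` contains "weight decay" (AdamW emits its decay ops under `/optimizer_*/weight decay/`) is stamped with `device:all`. A hedged, standalone sketch of that predicate and the stamping loop; the helper names, the `"gpu"` device string, and the `"op_device"` attribute key are illustrative assumptions (the real key comes from `self._op_device_key`):

```python
def is_weight_decay_op(op):
    # AdamW emits its decay ops under an /optimizer_*/weight decay/ namescope.
    return op.desc.has_attr("op_namescope") \
        and 'weight decay' in op.desc.attr("op_namescope")


def mark_weight_decay_ops(block, device="gpu", op_device_key="op_device"):
    for op in block.ops:
        if is_weight_decay_op(op) and op.type == 'scale':
            # Replicate AdamW's decay_coeff scale op on every pipeline stage.
            op._set_attr(op_device_key, f"{device}:all")
```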
......@@ -116,10 +116,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
steps_per_pass = 10
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
opt = fluid.optimizer.Momentum(
lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr)
opt = paddle.optimizer.AdamW(
learning_rate=lr_val,
momentum=0.9,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
acc_steps = 2 # accumulated steps for pipeline
......
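For reference, the updated test builds the schedule and optimizer with the 2.x API instead of `fluid.layers.piecewise_decay` plus `Momentum`. A self-contained sketch of the resulting setup; `base_lr`, `passes`, and `steps_per_pass` are illustrative values, not necessarily those in the test:

```python
import paddle
import paddle.fluid as fluid

paddle.enable_static()

base_lr = 0.001
passes = [30, 60, 80, 90]
steps_per_pass = 10

# Piecewise schedule: drop the rate by 10x at each boundary.
bd = [steps_per_pass * p for p in passes]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]

lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr)
opt = paddle.optimizer.AdamW(
    learning_rate=lr_val,
    grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
```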
......@@ -96,6 +96,15 @@ class TestDistRunnerBase(object):
current_endpoint=current_endpoint)
return t
@staticmethod
def get_lr_scheduler(program):
lr_sheduler = None
if hasattr(program, 'lr_sheduler'):
from paddle.optimizer.lr import LRScheduler
lr_sheduler = program.lr_sheduler
assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
return lr_sheduler
def run_pserver(self, args):
self.lr = args.lr
self.get_model(batch_size=args.batch_size)
......@@ -139,11 +148,17 @@ class TestDistRunnerBase(object):
data_loader.start()
print_to_err(type(self).__name__, "begin to train on trainer")
out_losses = []
main_program = fluid.default_main_program()
lr_sheduler = self.get_lr_scheduler(main_program)
for i in six.moves.xrange(RUN_STEP):
loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
loss = exe.run(main_program, fetch_list=[avg_cost])
loss = loss[0] if loss else None
out_losses.append(loss)
print_to_err(type(self).__name__, "run step %d finished" % i)
if lr_sheduler is not None:
lr_sheduler.step()
data_loader.reset()
print_to_err(type(self).__name__, "trainer run finished")
......@@ -494,6 +509,7 @@ class TestDistRunnerBase(object):
else:
return origin_batch
lr_scheduler = self.get_lr_scheduler(trainer_prog)
print_to_err(type(self).__name__, "begin to train on trainer")
out_losses = []
for i in six.moves.xrange(RUN_STEP):
......@@ -502,6 +518,9 @@ class TestDistRunnerBase(object):
feed=feeder.feed(get_data()))
out_losses.append(loss[0])
print_to_err(type(self).__name__, "run step %d finished" % i)
if lr_scheduler is not None:
lr_scheduler.step()
print_to_err(type(self).__name__, "trainer run finished")
print_to_out(out_losses)
......
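Both training loops in the test base now follow the same per-step pattern: run one iteration of the static program, record the loss, then advance the scheduler so the next iteration sees the updated rate. A compact sketch of that pattern as a standalone function; the argument names are assumed to match the objects the test class sets up, and the scheduler lookup mirrors `get_lr_scheduler`:

```python
def run_steps(exe, main_program, avg_cost, run_step):
    # Mirror TestDistRunnerBase.get_lr_scheduler: the scheduler, if any, is
    # attached to the program as `lr_sheduler`.
    lr_scheduler = getattr(main_program, 'lr_sheduler', None)

    out_losses = []
    for i in range(run_step):
        loss = exe.run(main_program, fetch_list=[avg_cost])
        out_losses.append(loss[0] if loss else None)
        # Advance the schedule once per training step so the next exe.run
        # picks up the updated learning-rate value.
        if lr_scheduler is not None:
            lr_scheduler.step()
    return out_losses
```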
......@@ -160,6 +160,7 @@ class AdamW(Adam):
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
self._lr_to_coeff = dict()
super(AdamW, self).__init__(
learning_rate=learning_rate,
parameters=parameters,
......@@ -211,7 +212,9 @@ class AdamW(Adam):
# we do this in _create_optimization_pass
decay_coeff = self._lr_to_coeff.get(learning_rate, None)
if decay_coeff is None:
decay_coeff = 1.0 - learning_rate * self._coeff
# NOTE(wangxi): for pipeline to set device:all
with paddle.static.device_guard(None):
decay_coeff = 1.0 - learning_rate * self._coeff
self._lr_to_coeff[learning_rate] = decay_coeff
find_master = (self._multi_precision and
......
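In AdamW, the cached decay coefficient `1.0 - learning_rate * self._coeff` is now built under `paddle.static.device_guard(None)`, so the ops it creates are not pinned to the current pipeline stage and the PipelineOptimizer pass above can assign them to `device:all`. A hedged sketch of that pattern in a standalone static program; the variable names and the `coeff` value are illustrative:

```python
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    # Stand-in for the scheduler-backed learning-rate variable.
    lr = paddle.static.create_global_var(
        shape=[1], value=0.001, dtype='float32', name='learning_rate')
    coeff = 0.01
    with paddle.static.device_guard(None):
        # The scale ops created for 1.0 - lr * coeff carry no pinned device,
        # leaving the pipeline pass free to stamp them with device:all.
        decay_coeff = 1.0 - lr * coeff
```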