diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..808d4516c57593210988542150b74c671e41d5da
--- /dev/null
+++ b/examples/seq2seq/README.md
@@ -0,0 +1,165 @@
+Running the example models in this directory requires PaddlePaddle Fluid 1.7. If your installed PaddlePaddle is older than this, please update it following the [installation guide](https://www.paddlepaddle.org.cn/#quick-start).
+
+# Sequence to Sequence (Seq2Seq)
+
+The layout of this example directory is as follows:
+
+```
+.
+├── README.md            # this document
+├── args.py              # configuration for training, prediction and model parameters
+├── reader.py            # data reading utilities
+├── download.py          # data downloading script
+├── train.py             # training entry point
+├── predict.py           # prediction entry point
+├── seq2seq_attn.py      # translation model with attention
+└── seq2seq_base.py      # translation model without attention
+```
+
+## Introduction
+
+Sequence to Sequence (Seq2Seq) uses an encoder-decoder architecture: an encoder encodes the source sequence into a vector, and a decoder decodes that vector into the target sequence. Seq2Seq is widely used in machine translation, dialogue systems, automatic document summarization, image captioning and similar tasks.
+
+This directory contains a classic Seq2Seq application, machine translation, with two implementations: a base model (without attention) and a model with attention. A Seq2Seq translation model mimics what a human translator does: first parse the source sentence and understand its meaning, then write a sentence in the target language that expresses that meaning. For the theory and mathematical formulation behind machine translation, we recommend the [machine translation case study](https://www.paddlepaddle.org.cn/documentation/docs/zh/user_guides/nlp_case/machine_translation/README.cn.html) on the PaddlePaddle website.
+
+## Model overview
+
+On the encoder side, this model uses a multi-layer LSTM-based RNN encoder; on the decoder side, it uses an RNN decoder with an attention mechanism, and a decoder without attention is also provided for comparison. At prediction time, beam search is used to generate the target sentence.
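+The attention used here scores each encoder state against the current decoder state, normalizes the scores with softmax, and mixes the encoder states into a context vector. The snippet below is a minimal NumPy sketch of this step, not part of the example itself: the names `hidden` and `encoder_output` mirror `seq2seq_attn.py`, while `W_in`/`W_out` are illustrative stand-ins for the projection weights.
+
+```python
+import numpy as np
+
+def attention_step(hidden, encoder_output, W_in, W_out):
+    # hidden: [batch, hidden_size], decoder state of the current step
+    # encoder_output: [batch, src_len, hidden_size]
+    keys = encoder_output @ W_in                    # project encoder states
+    scores = np.einsum("bh,bsh->bs", hidden, keys)  # dot-product scores
+    # padding positions would be masked here by adding a large negative value
+    scores = np.exp(scores - scores.max(-1, keepdims=True))
+    weights = scores / scores.sum(-1, keepdims=True)   # softmax over source
+    context = np.einsum("bs,bsh->bh", weights, keys)   # weighted sum
+    # concatenate context with the decoder state and project to hidden size
+    return np.concatenate([context, hidden], -1) @ W_out
+```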
+## Data
+
+This example uses the English-to-Vietnamese portion of the [IWSLT'15 English-Vietnamese data](https://nlp.stanford.edu/projects/nmt/) as the training corpus, with tst2012 as the development set and tst2013 as the test set.
+
+### Downloading the data
+
+```
+python download.py
+```
+
+## Training
+
+Run the following command to train the Seq2Seq translation model with attention:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py \
+    --src_lang en --tar_lang vi \
+    --attention True \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 128 \
+    --dropout 0.2 \
+    --init_scale 0.1 \
+    --max_grad_norm 5.0 \
+    --train_data_prefix data/en-vi/train \
+    --eval_data_prefix data/en-vi/tst2012 \
+    --test_data_prefix data/en-vi/tst2013 \
+    --vocab_prefix data/en-vi/vocab \
+    --use_gpu True \
+    --model_path ./attention_models
+```
+
+Set the `attention` argument to False to train the Seq2Seq model without attention; see `args.py` for a description of every argument. The training program saves the model once at the end of each epoch.
+
+Training runs in dynamic graph (eager) mode by default; set the `eager_run` argument to False to train in static graph mode instead, as follows:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py \
+    --src_lang en --tar_lang vi \
+    --attention True \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 128 \
+    --dropout 0.2 \
+    --init_scale 0.1 \
+    --max_grad_norm 5.0 \
+    --train_data_prefix data/en-vi/train \
+    --eval_data_prefix data/en-vi/tst2012 \
+    --test_data_prefix data/en-vi/tst2013 \
+    --vocab_prefix data/en-vi/vocab \
+    --use_gpu True \
+    --model_path ./attention_models \
+    --eager_run False
+```
+
+## Prediction
+
+After training, the saved model (specified by `--reload_model`) can be used to decode the test data (specified by `--infer_file`) with beam search, as follows:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python predict.py \
+    --attention True \
+    --src_lang en --tar_lang vi \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 128 \
+    --dropout 0.2 \
+    --init_scale 0.1 \
+    --max_grad_norm 5.0 \
+    --vocab_prefix data/en-vi/vocab \
+    --infer_file data/en-vi/tst2013.en \
+    --reload_model attention_models/10 \
+    --infer_output_file infer_output.txt \
+    --beam_size 10 \
+    --use_gpu True
+```
+
+See `args.py` for a description of every argument, and note that the model hyper-parameters used for prediction must match those used for training. As with training, prediction can also run in static graph mode:
+
+```sh
+export CUDA_VISIBLE_DEVICES=0
+
+python predict.py \
+    --attention True \
+    --src_lang en --tar_lang vi \
+    --num_layers 2 \
+    --hidden_size 512 \
+    --src_vocab_size 17191 \
+    --tar_vocab_size 7709 \
+    --batch_size 128 \
+    --dropout 0.2 \
+    --init_scale 0.1 \
+    --max_grad_norm 5.0 \
+    --vocab_prefix data/en-vi/vocab \
+    --infer_file data/en-vi/tst2013.en \
+    --reload_model attention_models/10 \
+    --infer_output_file infer_output.txt \
+    --beam_size 10 \
+    --use_gpu True \
+    --eager_run False
+```
+
+## Evaluation
+
+Use the [*multi-bleu.perl*](https://github.com/moses-smt/mosesdecoder.git) script to evaluate the quality of the predicted translations, as follows:
+
+```sh
+mosesdecoder/scripts/generic/multi-bleu.perl tst2013.vi < infer_output.txt
+```
+
+Each model was trained 10 times; for each run, the model saved at the 10th epoch was used for prediction with beam_size=10. The results are as follows (the 10 results are sorted in ascending order for readability):
+
+```
+> no attention
+tst2012 BLEU:
+[10.75 10.85 10.9 10.94 10.97 11.01 11.01 11.04 11.13 11.4]
+tst2013 BLEU:
+[10.71 10.71 10.74 10.76 10.91 10.94 11.02 11.16 11.21 11.44]
+
+> with attention
+tst2012 BLEU:
+[21.14 22.34 22.54 22.65 22.71 22.71 23.08 23.15 23.3 23.4]
+tst2013 BLEU:
+[23.41 24.79 25.11 25.12 25.19 25.24 25.39 25.61 25.61 25.63]
+```
diff --git a/examples/seq2seq/args.py b/examples/seq2seq/args.py
new file mode 100644
index 0000000000000000000000000000000000000000..94b07cd2cfed8a87c6b9bc9b95168ad7f0f6b924
--- /dev/null
+++ b/examples/seq2seq/args.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
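+# args.py: command-line options shared by train.py and predict.py; see the
+# README for typical settings.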
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import distutils.util
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--train_data_prefix", type=str, help="file prefix for train data")
+    parser.add_argument(
+        "--eval_data_prefix", type=str, help="file prefix for eval data")
+    parser.add_argument(
+        "--test_data_prefix", type=str, help="file prefix for test data")
+    parser.add_argument(
+        "--vocab_prefix", type=str, help="file prefix for vocab")
+    parser.add_argument("--src_lang", type=str, help="source language suffix")
+    parser.add_argument("--tar_lang", type=str, help="target language suffix")
+
+    parser.add_argument(
+        "--attention",
+        type=eval,
+        default=False,
+        help="Whether to use the attention model")
+
+    parser.add_argument(
+        "--optimizer",
+        type=str,
+        default='adam',
+        help="optimizer to use, only support [sgd|adam]")
+
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=0.001,
+        help="learning rate for optimizer")
+
+    parser.add_argument(
+        "--num_layers",
+        type=int,
+        default=1,
+        help="number of layers of encoder and decoder")
+    parser.add_argument(
+        "--hidden_size",
+        type=int,
+        default=100,
+        help="hidden size of encoder and decoder")
+    parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
+    parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")
+
+    parser.add_argument(
+        "--batch_size", type=int, help="batch size of each step")
+
+    parser.add_argument(
+        "--max_epoch", type=int, default=12, help="max epoch for the training")
+
+    parser.add_argument(
+        "--max_len",
+        type=int,
+        default=50,
+        help="max length for source and target sentence")
+    parser.add_argument(
+        "--dropout", type=float, default=0.0, help="drop probability")
+    parser.add_argument(
+        "--init_scale",
+        type=float,
+        default=0.0,
+        help="init scale for parameter")
+    parser.add_argument(
+        "--max_grad_norm",
+        type=float,
+        default=5.0,
+        help="max grad norm for global norm clip")
+
+    parser.add_argument(
+        "--log_freq",
+        type=int,
+        default=100,
+        help="The frequency to print training logs")
+
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default='model',
+        help="model path for model to save")
+
+    parser.add_argument(
+        "--reload_model", type=str, help="model path to reload for inference")
+
+    parser.add_argument(
+        "--infer_file", type=str, help="file name for inference")
+    parser.add_argument(
+        "--infer_output_file",
+        type=str,
+        default='infer_output',
+        help="file name for inference output")
+    parser.add_argument(
+        "--beam_size", type=int, default=10, help="beam size for beam search")
+
+    parser.add_argument(
+        '--use_gpu',
+        type=eval,
+        default=False,
+        help='Whether to use gpu [True|False]')
+
+    parser.add_argument(
+        '--eager_run', type=eval, default=False, help='Whether to use dygraph')
+
+    parser.add_argument(
+        "--enable_ce",
+        action='store_true',
+        help="The flag indicating whether to run the task "
+        "for continuous evaluation.")
+
+    parser.add_argument(
+        "--profile", action='store_true', help="Whether to enable the profiler.")
+    # NOTE: profiler args, used for benchmark
+    parser.add_argument(
+        "--profiler_path",
+        type=str,
+        default='./seq2seq.profile',
+        help="the profiler output file path. (used for benchmark)")
+    args = parser.parse_args()
+    return args
diff --git a/examples/seq2seq/download.py b/examples/seq2seq/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d2981f452f832fb70e6962d0f5aec7db967c2d9
--- /dev/null
+++ b/examples/seq2seq/download.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+Script for downloading training data.
+'''
+import os
+import urllib
+import sys
+
+if sys.version_info >= (3, 0):
+    import urllib.request
+import zipfile
+
+URLLIB = urllib
+if sys.version_info >= (3, 0):
+    URLLIB = urllib.request
+
+remote_path = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi'
+base_path = 'data'
+tar_path = os.path.join(base_path, 'en-vi')
+filenames = [
+    'train.en', 'train.vi', 'tst2012.en', 'tst2012.vi', 'tst2013.en',
+    'tst2013.vi', 'vocab.en', 'vocab.vi'
+]
+
+
+def main(arguments):
+    print("Downloading data......")
+
+    if not os.path.exists(tar_path):
+        if not os.path.exists(base_path):
+            os.mkdir(base_path)
+        os.mkdir(tar_path)
+
+    for filename in filenames:
+        url = remote_path + '/' + filename
+        tar_file = os.path.join(tar_path, filename)
+        URLLIB.urlretrieve(url, tar_file)
+    print("Downloaded successfully......")
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))
diff --git a/examples/seq2seq/predict.py b/examples/seq2seq/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1e3e87fddf05d453ed984a49d42fcac0f833cab
--- /dev/null
+++ b/examples/seq2seq/predict.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import io
+import random
+from functools import partial
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.layers.utils import flatten
+from paddle.fluid.io import DataLoader
+
+from hapi.model import Input, set_device
+from args import parse_args
+from seq2seq_base import BaseInferModel
+from seq2seq_attn import AttentionInferModel
+from reader import Seq2SeqDataset, Seq2SeqBatchSampler, SortType, prepare_infer_input
+
+
+def post_process_seq(seq, bos_idx, eos_idx, output_bos=False,
+                     output_eos=False):
+    """
+    Post-process the decoded sequence.
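+    The output is truncated at the first end-of-sequence token, and the
+    begin/end tokens are kept or dropped according to output_bos/output_eos.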
+ """ + eos_pos = len(seq) - 1 + for i, idx in enumerate(seq): + if idx == eos_idx: + eos_pos = i + break + seq = [ + idx for idx in seq[:eos_pos + 1] + if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx) + ] + return seq + + +def do_predict(args): + device = set_device("gpu" if args.use_gpu else "cpu") + fluid.enable_dygraph(device) if args.eager_run else None + + # define model + inputs = [ + Input( + [None, None], "int64", name="src_word"), + Input( + [None], "int64", name="src_length"), + ] + + # def dataloader + dataset = Seq2SeqDataset( + fpattern=args.infer_file, + src_vocab_fpath=args.vocab_prefix + "." + args.src_lang, + trg_vocab_fpath=args.vocab_prefix + "." + args.tar_lang, + token_delimiter=None, + start_mark="", + end_mark="", + unk_mark="") + trg_idx2word = Seq2SeqDataset.load_dict( + dict_path=args.vocab_prefix + "." + args.tar_lang, reverse=True) + (args.src_vocab_size, args.trg_vocab_size, bos_id, eos_id, + unk_id) = dataset.get_vocab_summary() + batch_sampler = Seq2SeqBatchSampler( + dataset=dataset, use_token_batch=False, batch_size=args.batch_size) + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + feed_list=None + if fluid.in_dygraph_mode() else [x.forward() for x in inputs], + collate_fn=partial( + prepare_infer_input, bos_id=bos_id, eos_id=eos_id, pad_id=eos_id), + num_workers=0, + return_list=True) + + model_maker = AttentionInferModel if args.attention else BaseInferModel + model = model_maker( + args.src_vocab_size, + args.tar_vocab_size, + args.hidden_size, + args.hidden_size, + args.num_layers, + args.dropout, + bos_id=bos_id, + eos_id=eos_id, + beam_size=args.beam_size, + max_out_len=256) + + model.prepare(inputs=inputs) + + # load the trained model + assert args.reload_model, ( + "Please set reload_model to load the infer model.") + model.load(args.reload_model) + + # TODO(guosheng): use model.predict when support variant length + with io.open(args.infer_output_file, 'w', encoding='utf-8') as f: + for data in data_loader(): + finished_seq = model.test_batch(inputs=flatten(data))[0] + finished_seq = finished_seq[:, :, np.newaxis] if len( + finished_seq.shape) == 2 else finished_seq + finished_seq = np.transpose(finished_seq, [0, 2, 1]) + for ins in finished_seq: + for beam_idx, beam in enumerate(ins): + id_list = post_process_seq(beam, bos_id, eos_id) + word_list = [trg_idx2word[id] for id in id_list] + sequence = " ".join(word_list) + "\n" + f.write(sequence) + break + + +if __name__ == "__main__": + args = parse_args() + do_predict(args) diff --git a/examples/seq2seq/reader.py b/examples/seq2seq/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..afa88a81058e6304af7e7cbd74a839fb2e46bd41 --- /dev/null +++ b/examples/seq2seq/reader.py @@ -0,0 +1,437 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import six
+import os
+import io
+import itertools
+from functools import partial
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+
+
+def create_data_loader(args, device, for_train=True):
+    data_loaders = [None, None]
+    data_prefixes = [args.train_data_prefix, args.eval_data_prefix
+                     ] if args.eval_data_prefix else [args.train_data_prefix]
+    for i, data_prefix in enumerate(data_prefixes):
+        dataset = Seq2SeqDataset(
+            fpattern=data_prefix + "." + args.src_lang,
+            trg_fpattern=data_prefix + "." + args.tar_lang,
+            src_vocab_fpath=args.vocab_prefix + "." + args.src_lang,
+            trg_vocab_fpath=args.vocab_prefix + "." + args.tar_lang,
+            token_delimiter=None,
+            start_mark="",
+            end_mark="",
+            unk_mark="",
+            max_length=args.max_len if i == 0 else None,
+            truncate=True,
+            trg_add_bos_eos=True)
+        (args.src_vocab_size, args.tar_vocab_size, bos_id, eos_id,
+         unk_id) = dataset.get_vocab_summary()
+        batch_sampler = Seq2SeqBatchSampler(
+            dataset=dataset,
+            use_token_batch=False,
+            batch_size=args.batch_size,
+            pool_size=args.batch_size * 20,
+            sort_type=SortType.POOL,
+            shuffle=False if args.enable_ce else True,
+            distribute_mode=True if i == 0 else False)
+        data_loader = DataLoader(
+            dataset=dataset,
+            batch_sampler=batch_sampler,
+            places=device,
+            collate_fn=partial(
+                prepare_train_input,
+                bos_id=bos_id,
+                eos_id=eos_id,
+                pad_id=eos_id),
+            num_workers=0,
+            return_list=True)
+        data_loaders[i] = data_loader
+    return data_loaders
+
+
+def prepare_train_input(insts, bos_id, eos_id, pad_id):
+    src, src_length = pad_batch_data(
+        [inst[0] for inst in insts], pad_id=pad_id)
+    trg, trg_length = pad_batch_data(
+        [inst[1] for inst in insts], pad_id=pad_id)
+    trg_length = trg_length - 1
+    return src, src_length, trg[:, :-1], trg_length, trg[:, 1:, np.newaxis]
+
+
+def prepare_infer_input(insts, bos_id, eos_id, pad_id):
+    src, src_length = pad_batch_data(insts, pad_id=pad_id)
+    return src, src_length
+
+
+def pad_batch_data(insts, pad_id):
+    """
+    Pad the instances to the max sequence length in batch, and return the
+    padded ids together with the original instance lengths.
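+    e.g. insts=[[1, 2], [3, 4, 5]] with pad_id=0 gives
+    [[1, 2, 0], [3, 4, 5]] and lengths [2, 3].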
+ """ + inst_lens = np.array([len(inst) for inst in insts], dtype="int64") + max_len = np.max(inst_lens) + inst_data = np.array( + [inst + [pad_id] * (max_len - len(inst)) for inst in insts], + dtype="int64") + return inst_data, inst_lens + + +class SortType(object): + GLOBAL = 'global' + POOL = 'pool' + NONE = "none" + + +class Converter(object): + def __init__(self, vocab, beg, end, unk, delimiter, add_beg, add_end): + self._vocab = vocab + self._beg = beg + self._end = end + self._unk = unk + self._delimiter = delimiter + self._add_beg = add_beg + self._add_end = add_end + + def __call__(self, sentence): + return ([self._beg] if self._add_beg else []) + [ + self._vocab.get(w, self._unk) + for w in sentence.split(self._delimiter) + ] + ([self._end] if self._add_end else []) + + +class ComposedConverter(object): + def __init__(self, converters): + self._converters = converters + + def __call__(self, fields): + return [ + converter(field) + for field, converter in zip(fields, self._converters) + ] + + +class SentenceBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self._batch_size = batch_size + + def append(self, info): + self.batch.append(info) + if len(self.batch) == self._batch_size: + tmp = self.batch + self.batch = [] + return tmp + + +class TokenBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self.max_len = -1 + self._batch_size = batch_size + + def append(self, info): + cur_len = info.max_len + max_len = max(self.max_len, cur_len) + if max_len * (len(self.batch) + 1) > self._batch_size: + result = self.batch + self.batch = [info] + self.max_len = cur_len + return result + else: + self.max_len = max_len + self.batch.append(info) + + +class SampleInfo(object): + def __init__(self, i, lens): + self.i = i + self.lens = lens + self.max_len = lens[0] # to be consitent with the original reader + + def get_ranges(self, min_length=None, max_length=None, truncate=False): + ranges = [] + # source + if (min_length is None or self.lens[0] >= min_length) and ( + max_length is None or self.lens[0] <= max_length or truncate): + end = max_length if truncate and max_length else self.lens[0] + ranges.append([0, end]) + # target + if len(self.lens) == 2: + if (min_length is None or self.lens[1] >= min_length) and ( + max_length is None or self.lens[1] <= max_length + 2 or + truncate): + end = max_length + 2 if truncate and max_length else self.lens[ + 1] + ranges.append([0, end]) + return ranges if len(ranges) == len(self.lens) else None + + +class MinMaxFilter(object): + def __init__(self, max_len, min_len, underlying_creator): + self._min_len = min_len + self._max_len = max_len + self._creator = underlying_creator + + def append(self, info): + if (self._min_len is None or info.min_len >= self._min_len) and ( + self._max_len is None or info.max_len <= self._max_len): + return self._creator.append(info) + + @property + def batch(self): + return self._creator.batch + + +class Seq2SeqDataset(Dataset): + def __init__(self, + src_vocab_fpath, + trg_vocab_fpath, + fpattern, + field_delimiter="\t", + token_delimiter=" ", + start_mark="", + end_mark="", + unk_mark="", + trg_fpattern=None, + trg_add_bos_eos=False, + byte_data=False, + min_length=None, + max_length=None, + truncate=False): + if byte_data: + # The WMT16 bpe data used here seems including bytes can not be + # decoded by utf8. 
Thus convert str to bytes, and use byte data + field_delimiter = field_delimiter.encode("utf8") + token_delimiter = token_delimiter.encode("utf8") + start_mark = start_mark.encode("utf8") + end_mark = end_mark.encode("utf8") + unk_mark = unk_mark.encode("utf8") + self._byte_data = byte_data + self._src_vocab = self.load_dict(src_vocab_fpath, byte_data=byte_data) + self._trg_vocab = self.load_dict(trg_vocab_fpath, byte_data=byte_data) + self._bos_idx = self._src_vocab[start_mark] + self._eos_idx = self._src_vocab[end_mark] + self._unk_idx = self._src_vocab[unk_mark] + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter + self._min_length = min_length + self._max_length = max_length + self._truncate = truncate + self._trg_add_bos_eos = trg_add_bos_eos + self.load_src_trg_ids(fpattern, trg_fpattern) + + def load_src_trg_ids(self, fpattern, trg_fpattern=None): + src_converter = Converter( + vocab=self._src_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=False, + add_end=False) + + trg_converter = Converter( + vocab=self._trg_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=True if self._trg_add_bos_eos else False, + add_end=True if self._trg_add_bos_eos else False) + + converters = ComposedConverter([src_converter, trg_converter]) + + self._src_seq_ids = [] + self._trg_seq_ids = [] + self._sample_infos = [] + + slots = [self._src_seq_ids, self._trg_seq_ids] + for i, line in enumerate(self._load_lines(fpattern, trg_fpattern)): + fields = converters(line) + lens = [len(field) for field in fields] + sample = SampleInfo(i, lens) + field_ranges = sample.get_ranges(self._min_length, + self._max_length, self._truncate) + if field_ranges: + for field, field_range, slot in zip(fields, field_ranges, + slots): + slot.append(field[field_range[0]:field_range[1]]) + self._sample_infos.append(sample) + + def _load_lines(self, fpattern, trg_fpattern=None): + fpaths = glob.glob(fpattern) + fpaths = sorted(fpaths) # TODO: Add custum sort + assert len(fpaths) > 0, "no matching file to the provided data path" + + (f_mode, f_encoding, + endl) = ("rb", None, b"\n") if self._byte_data else ("r", "utf8", + "\n") + if trg_fpattern is None: + for fpath in fpaths: + with io.open(fpath, f_mode, encoding=f_encoding) as f: + for line in f: + fields = line.strip(endl).split(self._field_delimiter) + yield fields + else: + # separated source and target language data files + # assume we can get aligned data by sort the two language files + # TODO: Need more rigorous check + trg_fpaths = glob.glob(trg_fpattern) + trg_fpaths = sorted(trg_fpaths) + assert len(fpaths) == len( + trg_fpaths + ), "the number of source language data files must equal \ + with that of source language" + + for fpath, trg_fpath in zip(fpaths, trg_fpaths): + with io.open(fpath, f_mode, encoding=f_encoding) as f: + with io.open( + trg_fpath, f_mode, encoding=f_encoding) as trg_f: + for line in zip(f, trg_f): + fields = [field.strip(endl) for field in line] + yield fields + + @staticmethod + def load_dict(dict_path, reverse=False, byte_data=False): + word_dict = {} + (f_mode, f_encoding, + endl) = ("rb", None, b"\n") if byte_data else ("r", "utf8", "\n") + with io.open(dict_path, f_mode, encoding=f_encoding) as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip(endl) + else: + word_dict[line.strip(endl)] = idx + return word_dict + + def get_vocab_summary(self): 
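+        # (src_vocab_size, trg_vocab_size, bos_id, eos_id, unk_id)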
+ return len(self._src_vocab), len( + self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx + + def __getitem__(self, idx): + return (self._src_seq_ids[idx], self._trg_seq_ids[idx] + ) if self._trg_seq_ids else self._src_seq_ids[idx] + + def __len__(self): + return len(self._sample_infos) + + +class Seq2SeqBatchSampler(BatchSampler): + def __init__(self, + dataset, + batch_size, + pool_size=10000, + sort_type=SortType.NONE, + min_length=None, + max_length=None, + shuffle=False, + shuffle_batch=False, + use_token_batch=False, + clip_last_batch=False, + distribute_mode=True, + seed=0): + for arg, value in locals().items(): + if arg != "self": + setattr(self, "_" + arg, value) + self._random = np.random + self._random.seed(seed) + # for multi-devices + self._distribute_mode = distribute_mode + self._nranks = ParallelEnv().nranks + self._local_rank = ParallelEnv().local_rank + self._device_id = ParallelEnv().dev_id + + def __iter__(self): + # global sort or global shuffle + if self._sort_type == SortType.GLOBAL: + infos = sorted( + self._dataset._sample_infos, key=lambda x: x.max_len) + else: + if self._shuffle: + infos = self._dataset._sample_infos + self._random.shuffle(infos) + else: + infos = self._dataset._sample_infos + + if self._sort_type == SortType.POOL: + reverse = True + for i in range(0, len(infos), self._pool_size): + # to avoid placing short next to long sentences + reverse = False # not reverse + infos[i:i + self._pool_size] = sorted( + infos[i:i + self._pool_size], + key=lambda x: x.max_len, + reverse=reverse) + + batches = [] + batch_creator = TokenBatchCreator( + self. + _batch_size) if self._use_token_batch else SentenceBatchCreator( + self._batch_size * self._nranks) + batch_creator = MinMaxFilter(self._max_length, self._min_length, + batch_creator) + + for info in infos: + batch = batch_creator.append(info) + if batch is not None: + batches.append(batch) + + if not self._clip_last_batch and len(batch_creator.batch) != 0: + batches.append(batch_creator.batch) + + if self._shuffle_batch: + self._random.shuffle(batches) + + if not self._use_token_batch: + # when producing batches according to sequence number, to confirm + # neighbor batches which would be feed and run parallel have similar + # length (thus similar computational cost) after shuffle, we as take + # them as a whole when shuffling and split here + batches = [[ + batch[self._batch_size * i:self._batch_size * (i + 1)] + for i in range(self._nranks) + ] for batch in batches] + batches = list(itertools.chain.from_iterable(batches)) + + # for multi-device + for batch_id, batch in enumerate(batches): + if not self._distribute_mode or ( + batch_id % self._nranks == self._local_rank): + batch_indices = [info.i for info in batch] + yield batch_indices + if self._distribute_mode and len(batches) % self._nranks != 0: + if self._local_rank >= len(batches) % self._nranks: + # use previous data to pad + yield batch_indices + + def __len__(self): + if not self._use_token_batch: + batch_number = ( + len(self._dataset) + self._batch_size * self._nranks - 1) // ( + self._batch_size * self._nranks) + else: + # TODO(guosheng): fix the uncertain length + batch_number = 1 + return batch_number diff --git a/examples/seq2seq/seq2seq_attn.py b/examples/seq2seq/seq2seq_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..ce9cc089ca2133549fbdd08ed600e69d4235e08c --- /dev/null +++ b/examples/seq2seq/seq2seq_attn.py @@ -0,0 +1,241 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid import ParamAttr +from paddle.fluid.initializer import UniformInitializer +from paddle.fluid.dygraph import Embedding, Linear, Layer +from paddle.fluid.layers import BeamSearchDecoder + +from hapi.model import Model, Loss +from hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell + +from seq2seq_base import Encoder + + +class AttentionLayer(Layer): + def __init__(self, hidden_size, bias=False, init_scale=0.1): + super(AttentionLayer, self).__init__() + self.input_proj = Linear( + hidden_size, + hidden_size, + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale)), + bias_attr=bias) + self.output_proj = Linear( + hidden_size + hidden_size, + hidden_size, + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale)), + bias_attr=bias) + + def forward(self, hidden, encoder_output, encoder_padding_mask): + # query = self.input_proj(hidden) + encoder_output = self.input_proj(encoder_output) + attn_scores = layers.matmul( + layers.unsqueeze(hidden, [1]), encoder_output, transpose_y=True) + if encoder_padding_mask is not None: + attn_scores = layers.elementwise_add(attn_scores, + encoder_padding_mask) + attn_scores = layers.softmax(attn_scores) + attn_out = layers.squeeze( + layers.matmul(attn_scores, encoder_output), [1]) + attn_out = layers.concat([attn_out, hidden], 1) + attn_out = self.output_proj(attn_out) + return attn_out + + +class DecoderCell(RNNCell): + def __init__(self, + num_layers, + input_size, + hidden_size, + dropout_prob=0., + init_scale=0.1): + super(DecoderCell, self).__init__() + self.dropout_prob = dropout_prob + # use add_sublayer to add multi-layers + self.lstm_cells = [] + for i in range(num_layers): + self.lstm_cells.append( + self.add_sublayer( + "lstm_%d" % i, + BasicLSTMCell( + input_size=input_size + hidden_size + if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale))))) + self.attention_layer = AttentionLayer(hidden_size) + + def forward(self, + step_input, + states, + encoder_output, + encoder_padding_mask=None): + lstm_states, input_feed = states + new_lstm_states = [] + step_input = layers.concat([step_input, input_feed], 1) + for i, lstm_cell in enumerate(self.lstm_cells): + out, new_lstm_state = lstm_cell(step_input, lstm_states[i]) + step_input = layers.dropout( + out, + self.dropout_prob, + dropout_implementation='upscale_in_train' + ) if self.dropout_prob > 0 else out + new_lstm_states.append(new_lstm_state) + out = self.attention_layer(step_input, encoder_output, + encoder_padding_mask) + return out, [new_lstm_states, out] + + +class Decoder(Layer): + def __init__(self, + vocab_size, + embed_dim, + hidden_size, + num_layers, + dropout_prob=0., + init_scale=0.1): + super(Decoder, self).__init__() + self.embedder = Embedding( + size=[vocab_size, embed_dim], + 
param_attr=ParamAttr(initializer=UniformInitializer(
+                low=-init_scale, high=init_scale)))
+        self.lstm_attention = RNN(DecoderCell(
+            num_layers, embed_dim, hidden_size, dropout_prob, init_scale),
+                                  is_reverse=False,
+                                  time_major=False)
+        self.output_layer = Linear(
+            hidden_size,
+            vocab_size,
+            param_attr=ParamAttr(initializer=UniformInitializer(
+                low=-init_scale, high=init_scale)),
+            bias_attr=False)
+
+    def forward(self, target, decoder_initial_states, encoder_output,
+                encoder_padding_mask):
+        inputs = self.embedder(target)
+        decoder_output, _ = self.lstm_attention(
+            inputs,
+            initial_states=decoder_initial_states,
+            encoder_output=encoder_output,
+            encoder_padding_mask=encoder_padding_mask)
+        predict = self.output_layer(decoder_output)
+        return predict
+
+
+class AttentionModel(Model):
+    def __init__(self,
+                 src_vocab_size,
+                 trg_vocab_size,
+                 embed_dim,
+                 hidden_size,
+                 num_layers,
+                 dropout_prob=0.,
+                 init_scale=0.1):
+        super(AttentionModel, self).__init__()
+        self.hidden_size = hidden_size
+        self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
+                               num_layers, dropout_prob, init_scale)
+        self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
+                               num_layers, dropout_prob, init_scale)
+
+    def forward(self, src, src_length, trg):
+        # encoder
+        encoder_output, encoder_final_state = self.encoder(src, src_length)
+
+        # decoder initial states: use input_feed and the structure is
+        # [[h,c] * num_layers, input_feed], consistent with DecoderCell.states
+        decoder_initial_states = [
+            encoder_final_state,
+            self.decoder.lstm_attention.cell.get_initial_states(
+                batch_ref=encoder_output, shape=[self.hidden_size])
+        ]
+        # attention mask to avoid paying attention to paddings
+        src_mask = layers.sequence_mask(
+            src_length,
+            maxlen=layers.shape(src)[1],
+            dtype=encoder_output.dtype)
+        encoder_padding_mask = (src_mask - 1.0) * 1e9
+        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
+
+        # decoder with attention
+        predict = self.decoder(trg, decoder_initial_states, encoder_output,
+                               encoder_padding_mask)
+        return predict
+
+
+class AttentionInferModel(AttentionModel):
+    def __init__(self,
+                 src_vocab_size,
+                 trg_vocab_size,
+                 embed_dim,
+                 hidden_size,
+                 num_layers,
+                 dropout_prob=0.,
+                 bos_id=0,
+                 eos_id=1,
+                 beam_size=4,
+                 max_out_len=256):
+        args = dict(locals())
+        args.pop("self")
+        args.pop("__class__", None)  # py3
+        self.bos_id = args.pop("bos_id")
+        self.eos_id = args.pop("eos_id")
+        self.beam_size = args.pop("beam_size")
+        self.max_out_len = args.pop("max_out_len")
+        super(AttentionInferModel, self).__init__(**args)
+        # dynamic decoder for inference
+        decoder = BeamSearchDecoder(
+            self.decoder.lstm_attention.cell,
+            start_token=bos_id,
+            end_token=eos_id,
+            beam_size=beam_size,
+            embedding_fn=self.decoder.embedder,
+            output_fn=self.decoder.output_layer)
+        self.beam_search_decoder = DynamicDecode(
+            decoder, max_step_num=max_out_len, is_test=True)
+
+    def forward(self, src, src_length):
+        # encoding
+        encoder_output, encoder_final_state = self.encoder(src, src_length)
+
+        # decoder initial states
+        decoder_initial_states = [
+            encoder_final_state,
+            self.decoder.lstm_attention.cell.get_initial_states(
+                batch_ref=encoder_output, shape=[self.hidden_size])
+        ]
+        # attention mask to avoid paying attention to paddings
+        src_mask = layers.sequence_mask(
+            src_length,
+            maxlen=layers.shape(src)[1],
+            dtype=encoder_output.dtype)
+        encoder_padding_mask = (src_mask - 1.0) * 1e9
+        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
+
+        # Tile the
batch dimension with beam_size + encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch( + encoder_output, self.beam_size) + encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch( + encoder_padding_mask, self.beam_size) + + # dynamic decoding with beam search + rs, _ = self.beam_search_decoder( + inits=decoder_initial_states, + encoder_output=encoder_output, + encoder_padding_mask=encoder_padding_mask) + return rs diff --git a/examples/seq2seq/seq2seq_base.py b/examples/seq2seq/seq2seq_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c28e2dc52935526d69d78ae73bfd48c92528b93c --- /dev/null +++ b/examples/seq2seq/seq2seq_base.py @@ -0,0 +1,203 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid import ParamAttr +from paddle.fluid.initializer import UniformInitializer +from paddle.fluid.dygraph import Embedding, Linear, Layer +from paddle.fluid.layers import BeamSearchDecoder + +from hapi.model import Model, Loss +from hapi.text import DynamicDecode, RNN, BasicLSTMCell, RNNCell + + +class CrossEntropyCriterion(Loss): + def __init__(self): + super(CrossEntropyCriterion, self).__init__() + + def forward(self, outputs, labels): + predict, (trg_length, label) = outputs[0], labels + # for target padding mask + mask = layers.sequence_mask( + trg_length, maxlen=layers.shape(predict)[1], dtype=predict.dtype) + + cost = layers.softmax_with_cross_entropy( + logits=predict, label=label, soft_label=False) + masked_cost = layers.elementwise_mul(cost, mask, axis=0) + batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0]) + seq_cost = layers.reduce_sum(batch_mean_cost) + return seq_cost + + +class EncoderCell(RNNCell): + def __init__(self, + num_layers, + input_size, + hidden_size, + dropout_prob=0., + init_scale=0.1): + super(EncoderCell, self).__init__() + self.dropout_prob = dropout_prob + # use add_sublayer to add multi-layers + self.lstm_cells = [] + for i in range(num_layers): + self.lstm_cells.append( + self.add_sublayer( + "lstm_%d" % i, + BasicLSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale))))) + + def forward(self, step_input, states): + new_states = [] + for i, lstm_cell in enumerate(self.lstm_cells): + out, new_state = lstm_cell(step_input, states[i]) + step_input = layers.dropout( + out, + self.dropout_prob, + dropout_implementation='upscale_in_train' + ) if self.dropout_prob > 0 else out + new_states.append(new_state) + return step_input, new_states + + @property + def state_shape(self): + return [cell.state_shape for cell in self.lstm_cells] + + +class Encoder(Layer): + def __init__(self, + vocab_size, + embed_dim, + hidden_size, + num_layers, + dropout_prob=0., + init_scale=0.1): + super(Encoder, self).__init__() + self.embedder = Embedding( + size=[vocab_size, 
embed_dim], + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale))) + self.stack_lstm = RNN(EncoderCell(num_layers, embed_dim, hidden_size, + dropout_prob, init_scale), + is_reverse=False, + time_major=False) + + def forward(self, sequence, sequence_length): + inputs = self.embedder(sequence) + encoder_output, encoder_state = self.stack_lstm( + inputs, sequence_length=sequence_length) + return encoder_output, encoder_state + + +DecoderCell = EncoderCell + + +class Decoder(Layer): + def __init__(self, + vocab_size, + embed_dim, + hidden_size, + num_layers, + dropout_prob=0., + init_scale=0.1): + super(Decoder, self).__init__() + self.embedder = Embedding( + size=[vocab_size, embed_dim], + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale))) + self.stack_lstm = RNN(DecoderCell(num_layers, embed_dim, hidden_size, + dropout_prob, init_scale), + is_reverse=False, + time_major=False) + self.output_layer = Linear( + hidden_size, + vocab_size, + param_attr=ParamAttr(initializer=UniformInitializer( + low=-init_scale, high=init_scale)), + bias_attr=False) + + def forward(self, target, decoder_initial_states): + inputs = self.embedder(target) + decoder_output, _ = self.stack_lstm( + inputs, initial_states=decoder_initial_states) + predict = self.output_layer(decoder_output) + return predict + + +class BaseModel(Model): + def __init__(self, + src_vocab_size, + trg_vocab_size, + embed_dim, + hidden_size, + num_layers, + dropout_prob=0., + init_scale=0.1): + super(BaseModel, self).__init__() + self.hidden_size = hidden_size + self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size, + num_layers, dropout_prob, init_scale) + self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size, + num_layers, dropout_prob, init_scale) + + def forward(self, src, src_length, trg): + # encoder + encoder_output, encoder_final_states = self.encoder(src, src_length) + + # decoder + predict = self.decoder(trg, encoder_final_states) + return predict + + +class BaseInferModel(BaseModel): + def __init__(self, + src_vocab_size, + trg_vocab_size, + embed_dim, + hidden_size, + num_layers, + dropout_prob=0., + bos_id=0, + eos_id=1, + beam_size=4, + max_out_len=256): + args = dict(locals()) + args.pop("self") + args.pop("__class__", None) # py3 + self.bos_id = args.pop("bos_id") + self.eos_id = args.pop("eos_id") + self.beam_size = args.pop("beam_size") + self.max_out_len = args.pop("max_out_len") + super(BaseInferModel, self).__init__(**args) + # dynamic decoder for inference + decoder = BeamSearchDecoder( + self.decoder.stack_lstm.cell, + start_token=bos_id, + end_token=eos_id, + beam_size=beam_size, + embedding_fn=self.decoder.embedder, + output_fn=self.decoder.output_layer) + self.beam_search_decoder = DynamicDecode( + decoder, max_step_num=max_out_len, is_test=True) + + def forward(self, src, src_length): + # encoding + encoder_output, encoder_final_states = self.encoder(src, src_length) + # dynamic decoding with beam search + rs, _ = self.beam_search_decoder(inits=encoder_final_states) + return rs diff --git a/examples/seq2seq/train.py b/examples/seq2seq/train.py new file mode 100644 index 0000000000000000000000000000000000000000..b7dc7698e31b1b5b935a63de66ee632956d3b102 --- /dev/null +++ b/examples/seq2seq/train.py @@ -0,0 +1,88 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import random +from functools import partial + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.io import DataLoader + +from hapi.model import Input, set_device +from args import parse_args +from seq2seq_base import BaseModel, CrossEntropyCriterion +from seq2seq_attn import AttentionModel +from reader import create_data_loader +from utility import PPL, TrainCallback + + +def do_train(args): + device = set_device("gpu" if args.use_gpu else "cpu") + fluid.enable_dygraph(device) if args.eager_run else None + + if args.enable_ce: + fluid.default_main_program().random_seed = 102 + fluid.default_startup_program().random_seed = 102 + + # define model + inputs = [ + Input( + [None, None], "int64", name="src_word"), + Input( + [None], "int64", name="src_length"), + Input( + [None, None], "int64", name="trg_word"), + ] + labels = [ + Input( + [None], "int64", name="trg_length"), + Input( + [None, None, 1], "int64", name="label"), + ] + + # def dataloader + train_loader, eval_loader = create_data_loader(args, device) + + model_maker = AttentionModel if args.attention else BaseModel + model = model_maker(args.src_vocab_size, args.tar_vocab_size, + args.hidden_size, args.hidden_size, args.num_layers, + args.dropout) + grad_clip = fluid.clip.GradientClipByGlobalNorm( + clip_norm=args.max_grad_norm) + optimizer = fluid.optimizer.Adam( + learning_rate=args.learning_rate, + parameter_list=model.parameters(), + grad_clip=grad_clip) + + ppl_metric = PPL(reset_freq=100) # ppl for every 100 batches + model.prepare( + optimizer, + CrossEntropyCriterion(), + ppl_metric, + inputs=inputs, + labels=labels) + model.fit(train_data=train_loader, + eval_data=eval_loader, + epochs=args.max_epoch, + eval_freq=1, + save_freq=1, + save_dir=args.model_path, + callbacks=[TrainCallback(ppl_metric, args.log_freq)]) + + +if __name__ == "__main__": + args = parse_args() + do_train(args) diff --git a/examples/seq2seq/utility.py b/examples/seq2seq/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..aa0dd4a461d24d8a7799ff47c9d63a65bf87d401 --- /dev/null +++ b/examples/seq2seq/utility.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
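+# utility.py: the PPL (perplexity) metric and the progress-bar callback used
+# by train.py.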
+ +import math + +import paddle.fluid as fluid + +from hapi.metrics import Metric +from hapi.callbacks import ProgBarLogger + + +class TrainCallback(ProgBarLogger): + def __init__(self, ppl, log_freq, verbose=2): + super(TrainCallback, self).__init__(log_freq, verbose) + self.ppl = ppl + + def on_train_begin(self, logs=None): + super(TrainCallback, self).on_train_begin(logs) + self.train_metrics = ["ppl"] # remove loss to not print it + + def on_epoch_begin(self, epoch=None, logs=None): + super(TrainCallback, self).on_epoch_begin(epoch, logs) + self.ppl.reset() + + def on_train_batch_end(self, step, logs=None): + logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"]) + if step > 0 and step % self.ppl.reset_freq == 0: + self.ppl.reset() + super(TrainCallback, self).on_train_batch_end(step, logs) + + def on_eval_begin(self, logs=None): + super(TrainCallback, self).on_eval_begin(logs) + self.eval_metrics = ["ppl"] + self.ppl.reset() + + def on_eval_batch_end(self, step, logs=None): + logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"]) + super(TrainCallback, self).on_eval_batch_end(step, logs) + + +class PPL(Metric): + def __init__(self, reset_freq=100, name=None): + super(PPL, self).__init__() + self._name = name or "ppl" + self.reset_freq = reset_freq + self.reset() + + def add_metric_op(self, pred, seq_length, label): + word_num = fluid.layers.reduce_sum(seq_length) + return word_num + + def update(self, word_num): + self.word_count += word_num + return word_num + + def reset(self): + self.total_loss = 0 + self.word_count = 0 + + def accumulate(self): + return self.word_count + + def name(self): + return self._name + + def cal_acc_ppl(self, batch_loss, batch_size): + self.total_loss += batch_loss * batch_size + ppl = math.exp(self.total_loss / self.word_count) + return ppl \ No newline at end of file diff --git a/transformer/README.md b/examples/transformer/README.md similarity index 99% rename from transformer/README.md rename to examples/transformer/README.md index 2c4c22b91788a091fc9c08e303e5bcae7d80a4de..0c785de8a262105a53386c2a6f417e1d499fba34 100644 --- a/transformer/README.md +++ b/examples/transformer/README.md @@ -201,7 +201,7 @@ python -u predict.py \ --special_token '' '' '' \ --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ --batch_size 32 \ - --init_from_params base_model_dygraph/step_100000/transformer \ + --init_from_params big_model_dygraph/step_100000/transformer \ --beam_size 5 \ --max_out_len 255 \ --output_file predict.txt \ diff --git a/transformer/gen_data.sh b/examples/transformer/gen_data.sh similarity index 100% rename from transformer/gen_data.sh rename to examples/transformer/gen_data.sh diff --git a/transformer/images/multi_head_attention.png b/examples/transformer/images/multi_head_attention.png similarity index 100% rename from transformer/images/multi_head_attention.png rename to examples/transformer/images/multi_head_attention.png diff --git a/transformer/images/transformer_network.png b/examples/transformer/images/transformer_network.png similarity index 100% rename from transformer/images/transformer_network.png rename to examples/transformer/images/transformer_network.png diff --git a/transformer/predict.py b/examples/transformer/predict.py similarity index 94% rename from transformer/predict.py rename to examples/transformer/predict.py index b83d5403486c1e661a939663bad154735b29b37e..a6e14314f523d78dee2f770e69a21ae808cd8ad1 100644 --- a/transformer/predict.py +++ 
b/examples/transformer/predict.py @@ -14,9 +14,6 @@ import logging import os -import six -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from functools import partial import numpy as np @@ -28,9 +25,9 @@ from paddle.fluid.layers.utils import flatten from utils.configure import PDConfig from utils.check import check_gpu, check_version -from model import Input, set_device +from hapi.model import Input, set_device from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler -from transformer import InferTransformer, position_encoding_init +from transformer import InferTransformer def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, @@ -132,7 +129,7 @@ def do_predict(args): # TODO: use model.predict when support variant length f = open(args.output_file, "wb") for data in data_loader(): - finished_seq = transformer.test(inputs=flatten(data))[0] + finished_seq = transformer.test_batch(inputs=flatten(data))[0] finished_seq = np.transpose(finished_seq, [0, 2, 1]) for ins in finished_seq: for beam_idx, beam in enumerate(ins): diff --git a/transformer/reader.py b/examples/transformer/reader.py similarity index 97% rename from transformer/reader.py rename to examples/transformer/reader.py index c0d02dcfb5b526ff8407f9320f31836d42ae5e4b..f6891df960b66fb9b48bb65d36af46f4ec601fc9 100644 --- a/transformer/reader.py +++ b/examples/transformer/reader.py @@ -13,7 +13,7 @@ # limitations under the License. import glob -import six +import sys import os import io import itertools @@ -26,7 +26,7 @@ from paddle.io import BatchSampler, DataLoader, Dataset def create_data_loader(args, device): - data_loaders = [None, None] + data_loaders = [(None, None)] * 2 data_files = [args.training_file, args.validation_file ] if args.validation_file else [args.training_file] for i, data_file in enumerate(data_files): @@ -65,7 +65,7 @@ def create_data_loader(args, device): n_head=args.n_head), num_workers=0, # TODO: use multi-process return_list=True) - data_loaders[i] = data_loader + data_loaders[i] = (data_loader, batch_sampler.__len__) return data_loaders @@ -289,7 +289,6 @@ class Seq2SeqDataset(Dataset): start_mark="", end_mark="", unk_mark="", - only_src=False, trg_fpattern=None, byte_data=False): if byte_data: @@ -477,6 +476,7 @@ class Seq2SeqBatchSampler(BatchSampler): for i in range(self._nranks) ] for batch in batches] batches = list(itertools.chain.from_iterable(batches)) + self.batch_number = (len(batches) + self._nranks - 1) // self._nranks # for multi-device for batch_id, batch in enumerate(batches): @@ -490,11 +490,13 @@ class Seq2SeqBatchSampler(BatchSampler): yield batch_indices def __len__(self): + if hasattr(self, "batch_number"): # + return self.batch_number if not self._use_token_batch: batch_number = ( len(self._dataset) + self._batch_size * self._nranks - 1) // ( self._batch_size * self._nranks) else: - # TODO(guosheng): fix the uncertain length - batch_number = 1 + # for uncertain batch number, the actual value is self.batch_number + batch_number = sys.maxsize return batch_number diff --git a/transformer/train.py b/examples/transformer/train.py similarity index 83% rename from transformer/train.py rename to examples/transformer/train.py index 04a61f83a0191a944d9b2611b3bca61f0bcf2a0a..94b52b4423839a0d7e01f0243cbb3d0f5907a4b0 100644 --- a/transformer/train.py +++ b/examples/transformer/train.py @@ -14,9 +14,6 @@ import logging import os -import six -import sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import 
numpy as np import paddle @@ -26,14 +23,18 @@ from paddle.io import DataLoader from utils.configure import PDConfig from utils.check import check_gpu, check_version -from model import Input, set_device -from callbacks import ProgBarLogger +from hapi.model import Input, set_device +from hapi.callbacks import ProgBarLogger from reader import create_data_loader from transformer import Transformer, CrossEntropyCriterion class TrainCallback(ProgBarLogger): - def __init__(self, args, verbose=2): + def __init__(self, + args, + verbose=2, + train_steps_fn=None, + eval_steps_fn=None): # TODO(guosheng): save according to step super(TrainCallback, self).__init__(args.print_step, verbose) # the best cross-entropy value with label smoothing @@ -42,11 +43,17 @@ class TrainCallback(ProgBarLogger): (1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) self.loss_normalizer = loss_normalizer + self.train_steps_fn = train_steps_fn + self.eval_steps_fn = eval_steps_fn def on_train_begin(self, logs=None): super(TrainCallback, self).on_train_begin(logs) self.train_metrics += ["normalized loss", "ppl"] + def on_train_batch_begin(self, step, logs=None): + if step == 0 and self.train_steps_fn: + self.train_progbar._num = self.train_steps_fn() + def on_train_batch_end(self, step, logs=None): logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer logs["ppl"] = np.exp(min(logs["loss"][0], 100)) @@ -57,6 +64,10 @@ class TrainCallback(ProgBarLogger): self.eval_metrics = list( self.eval_metrics) + ["normalized loss", "ppl"] + def on_eval_batch_begin(self, step, logs=None): + if step == 0 and self.eval_steps_fn: + self.eval_progbar._num = self.eval_steps_fn() + def on_eval_batch_end(self, step, logs=None): logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer logs["ppl"] = np.exp(min(logs["loss"][0], 100)) @@ -104,7 +115,8 @@ def do_train(args): ] # def dataloader - train_loader, eval_loader = create_data_loader(args, device) + (train_loader, train_steps_fn), ( + eval_loader, eval_steps_fn) = create_data_loader(args, device) # define model transformer = Transformer( @@ -142,7 +154,12 @@ def do_train(args): eval_freq=1, save_freq=1, save_dir=args.save_model, - callbacks=[TrainCallback(args)]) + callbacks=[ + TrainCallback( + args, + train_steps_fn=train_steps_fn, + eval_steps_fn=eval_steps_fn) + ]) if __name__ == "__main__": diff --git a/transformer/transformer.py b/examples/transformer/transformer.py similarity index 99% rename from transformer/transformer.py rename to examples/transformer/transformer.py index 9caf4b04a1a34c5e856a789fbded8a53e917a3da..30bb931d28c3b52467f484f4cb14b5d5601c76d9 100644 --- a/transformer/transformer.py +++ b/examples/transformer/transformer.py @@ -20,8 +20,8 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay -from model import Model, CrossEntropy, Loss -from text import TransformerBeamSearchDecoder, DynamicDecode +from hapi.model import Model, CrossEntropy, Loss +from hapi.text import TransformerBeamSearchDecoder, DynamicDecode def position_encoding_init(n_position, d_pos_vec): diff --git a/transformer/transformer.yaml b/examples/transformer/transformer.yaml similarity index 100% rename from transformer/transformer.yaml rename to examples/transformer/transformer.yaml diff --git a/transformer/utils/__init__.py 
b/examples/transformer/utils/__init__.py similarity index 100% rename from transformer/utils/__init__.py rename to examples/transformer/utils/__init__.py diff --git a/transformer/utils/check.py b/examples/transformer/utils/check.py similarity index 100% rename from transformer/utils/check.py rename to examples/transformer/utils/check.py diff --git a/transformer/utils/configure.py b/examples/transformer/utils/configure.py similarity index 95% rename from transformer/utils/configure.py rename to examples/transformer/utils/configure.py index 67e601282fee572518435eaed38a4ed8e26fc5f9..17dfaa53d8b44a68a2847c4bc1a1934384bb5f82 100644 --- a/transformer/utils/configure.py +++ b/examples/transformer/utils/configure.py @@ -195,13 +195,19 @@ class PDConfig(object): "Whether to perform predicting.") self.default_g.add_arg("do_eval", bool, False, "Whether to perform evaluating.") - self.default_g.add_arg("do_save_inference_model", bool, False, - "Whether to perform model saving for inference.") + self.default_g.add_arg( + "do_save_inference_model", bool, False, + "Whether to perform model saving for inference.") # NOTE: args for profiler - self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)") - self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)") - self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)") + self.default_g.add_arg( + "is_profiler", int, 0, + "the switch of profiler tools. (used for benchmark)") + self.default_g.add_arg( + "profiler_path", str, './', + "the profiler output file path. (used for benchmark)") + self.default_g.add_arg("max_iter", int, 0, + "the max train batch num.(used for benchmark)") self.parser = parser diff --git a/hapi/callbacks.py b/hapi/callbacks.py index f02eec1ac7b20fe3d5ec771493378b4e74cc3796..62d6402941d0ab0e8af1b3efb3dd77d8ad05604d 100644 --- a/hapi/callbacks.py +++ b/hapi/callbacks.py @@ -215,13 +215,13 @@ class ProgBarLogger(Callback): if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv( ).local_rank == 0: - # if steps is not None, last step will update in on_epoch_end - if self.steps and self.train_step < self.steps: + if self.steps is None or self.train_step < self.steps: self._updates(logs, 'train') def on_epoch_end(self, epoch, logs=None): logs = logs or {} - if self.verbose and ParallelEnv().local_rank == 0: + if self.train_step % self.log_freq != 0 and self.verbose and ParallelEnv( + ).local_rank == 0: self._updates(logs, 'train') def on_eval_begin(self, logs=None): @@ -242,14 +242,14 @@ class ProgBarLogger(Callback): if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv( ).local_rank == 0: - # if steps is not None, last step will update in on_epoch_end - if self.eval_steps and self.eval_step < self.eval_steps: + if self.eval_steps is None or self.eval_step < self.eval_steps: self._updates(logs, 'eval') def on_eval_end(self, logs=None): logs = logs or {} if self.verbose and ParallelEnv().local_rank == 0: - self._updates(logs, 'eval') + if self.eval_step % self.log_freq != 0: + self._updates(logs, 'eval') print('Eval samples: %d' % (self.evaled_samples)) diff --git a/hapi/model.py b/hapi/model.py index b9dc4ca441e8c2531df633b946e5c4da30bffa44..cde4ba6040be334f3ba902413ebf6953e2f35140 100644 --- a/hapi/model.py +++ b/hapi/model.py @@ -576,14 +576,15 @@ class DynamicGraphAdapter(object): if labels is not None: labels = [to_variable(l) for l in to_list(labels)] if self._nranks > 1: - outputs 
diff --git a/hapi/model.py b/hapi/model.py
index b9dc4ca441e8c2531df633b946e5c4da30bffa44..cde4ba6040be334f3ba902413ebf6953e2f35140 100644
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -576,14 +576,15 @@ class DynamicGraphAdapter(object):
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
         if self._nranks > 1:
-            outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.ddp_model.forward(
+                * [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss = self.ddp_model.scale_loss(final_loss)
             final_loss.backward()
             self.ddp_model.apply_collective_grads()
         else:
-            outputs = self.model.forward(*[to_variable(x) for x in inputs])
+            outputs = self.model.forward(* [to_variable(x) for x in inputs])
             losses = self.model._loss_function(outputs, labels)
             final_loss = fluid.layers.sum(losses)
             final_loss.backward()
@@ -592,9 +593,9 @@ class DynamicGraphAdapter(object):
         self.model.clear_gradients()
         metrics = []
         for metric in self.model._metrics:
-            metric_outs = metric.add_metric_op(
-                *(to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
         return ([to_numpy(l) for l in losses], metrics) \
@@ -606,7 +607,7 @@ class DynamicGraphAdapter(object):
         inputs = to_list(inputs)
         if labels is not None:
             labels = [to_variable(l) for l in to_list(labels)]
-        outputs = self.model.forward(*[to_variable(x) for x in inputs])
+        outputs = self.model.forward(* [to_variable(x) for x in inputs])
         if self.model._loss_function:
             losses = self.model._loss_function(outputs, labels)
         else:
@@ -632,9 +633,9 @@ class DynamicGraphAdapter(object):
                 self._merge_count[self.mode + '_total'] += samples
                 self._merge_count[self.mode + '_batch'] = samples
 
-            metric_outs = metric.add_metric_op(
-                *(to_list(outputs) + to_list(labels)))
-            m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
+            metric_outs = metric.add_metric_op(*(to_list(outputs) + to_list(
+                labels)))
+            m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)
 
         # To be consistent with static graph
@@ -1009,7 +1010,7 @@ class Model(fluid.dygraph.Layer):
         do_eval = eval_loader is not None
         self._test_dataloader = eval_loader
         metrics_name = self._metrics_name()
-        steps = len(train_loader) if hasattr(train_loader, '__len__') else None
+        steps = self._len_data_loader(train_loader)
         cbks = config_callbacks(
             callbacks,
             model=self,
@@ -1037,8 +1038,7 @@ class Model(fluid.dygraph.Layer):
 
                 if not isinstance(eval_loader, Iterable):
                     loader = eval_loader()
-                eval_steps = len(loader) if hasattr(loader,
-                                                    '__len__') else None
+                eval_steps = self._len_data_loader(loader)
                 cbks.on_begin('eval', {
                     'steps': eval_steps,
                     'metrics_name': metrics_name
@@ -1114,7 +1114,7 @@ class Model(fluid.dygraph.Layer):
 
         if not isinstance(eval_loader, Iterable):
             loader = eval_loader()
-        eval_steps = len(loader) if hasattr(loader, '__len__') else None
+        eval_steps = self._len_data_loader(loader)
         cbks.on_begin('eval',
                       {'steps': eval_steps,
                        'metrics_name': metrics_name})
@@ -1214,7 +1214,7 @@ class Model(fluid.dygraph.Layer):
                            mode,
                            metrics_name,
                            epoch=None):
-        size = len(data_loader) if hasattr(data_loader, '__len__') else None
+        size = self._len_data_loader(data_loader)
         logs = {
             'steps': size,
             'metrics_name': metrics_name,
@@ -1289,3 +1289,10 @@ class Model(fluid.dygraph.Layer):
         for m in self._metrics:
             metrics_name.extend(to_list(m.name()))
         return metrics_name
+
+    def _len_data_loader(self, data_loader):
+        try:
+            steps = len(data_loader)
+        except Exception:
+            steps = None
+        return steps
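The final hunk above collapses four copies of `len(x) if hasattr(x, '__len__') else None` into a single `_len_data_loader` helper. Using `try/except` instead of `hasattr` also covers loaders whose `__len__` exists but raises when called, and a `None` result means "step count unknown" — exactly the case the `ProgBarLogger` guards earlier now handle. A standalone illustration of the pattern (plain Python objects in place of hapi's data loaders; `len_or_none` is a hypothetical name):

```python
def len_or_none(data_loader):
    # EAFP version of the hasattr check: any failure to compute a
    # length (TypeError for generators, exceptions raised inside
    # __len__) maps to None, i.e. "unknown number of steps".
    try:
        return len(data_loader)
    except Exception:
        return None

print(len_or_none([1, 2, 3]))            # 3
print(len_or_none(x for x in range(3)))  # None: generators have no len()
```

The `hapi/text/text.py` hunks that follow make a related defensive change: `sequence_length` is now compared against `None` explicitly rather than truth-tested, since truth-testing conflates "argument not given" with "tensor whose value happens to be falsy". The pitfall is easy to reproduce with NumPy standing in for a framework tensor:

```python
import numpy as np

seq_len = np.array([0])      # a provided length tensor holding 0
print(bool(seq_len))         # False -- depends on the tensor's contents
print(seq_len is not None)   # True  -- only asks whether it was provided
```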
diff --git a/hapi/text/text.py b/hapi/text/text.py
index e5be32bcb531b938c3cc8c21ec7caf2a4f40ee6e..ee74c516437a366e1dd91cde236346ecf2e1b787 100644
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -238,8 +238,9 @@ class BasicLSTMCell(RNNCell):
         self._bias_attr = bias_attr
         self._gate_activation = gate_activation or layers.sigmoid
         self._activation = activation or layers.tanh
-        self._forget_bias = layers.fill_constant(
-            [1], dtype=dtype, value=forget_bias)
+        # TODO(guosheng): find better way to resolve constants in __init__
+        self._forget_bias = layers.create_global_var(
+            shape=[1], dtype=dtype, value=forget_bias, persistable=True)
         self._forget_bias.stop_gradient = False
         self._dtype = dtype
         self._input_size = input_size
@@ -817,7 +818,7 @@ class RNN(fluid.dygraph.Layer):
                 lambda x: fluid.layers.transpose(x, [1, 0] + list(
                     range(2, len(x.shape)))), inputs)
 
-            if sequence_length:
+            if sequence_length is not None:
                 mask = fluid.layers.sequence_mask(
                     sequence_length,
                     maxlen=time_steps,
@@ -828,7 +829,7 @@ class RNN(fluid.dygraph.Layer):
                 inputs = map_structure(
                     lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
                 mask = fluid.layers.reverse(
-                    mask, axis=[0]) if sequence_length else None
+                    mask, axis=[0]) if sequence_length is not None else None
 
             states = initial_states
             outputs = []
@@ -836,7 +837,7 @@ class RNN(fluid.dygraph.Layer):
                 step_inputs = map_structure(lambda x: x[i], inputs)
                 step_outputs, new_states = self.cell(step_inputs, states,
                                                      **kwargs)
-                if sequence_length:
+                if sequence_length is not None:
                     new_states = map_structure(
                         partial(
                             _maybe_copy, step_mask=mask[i]),