Merge pull request #33 from xyzhou-puck/master

add bert, refine text.py

Merge pull request #33 from xyzhou-puck/master
add bert, refine text.py
4bf36628 · pkpk · GitHub · d90f7cc6 · 1373e294 · 4bf36628
36 changed file
--- a/examples/bert/bert.yaml
+++ b/examples/bert/bert.yaml
+bert_config_path: "./config/bert_config.json"
+init_checkpoint: None
+init_pretraining_params: None
+checkpoints: "./saved_model"
+epoch: 3
+learning_rate: 0.0001
+lr_scheduler: "linear_warmup_decay"
+weight_decay: 0.01
+warmup_proportion: 0.1
+save_steps: 100000
+validation_steps: 100000
+loss_scaling: 1.0
+skip_steps: 100
+data_dir: None
+vocab_path: None
+max_seq_len: 512
+batch_size: 32
+in_tokens: False
+do_lower_case: True
+random_seed: 5512
+use_cuda: True
+shuffle: True
+do_train: True
+do_test: True
+use_data_parallel: False
+verbose: False
+
--- a/examples/bert/bert_classifier.py
+++ b/examples/bert/bert_classifier.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT fine-tuning in Paddle Dygraph Mode."""
+
+import paddle.fluid as fluid
+from hapi.metrics import Accuracy
+from hapi.configure import Config
+from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
+
+from cls import ClsModelLayer
+import hapi.text.tokenizer.tokenization as tokenization
+from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
+
+
+def train():
+
+    config = Config(yaml_file="./bert.yaml")
+    config.build()
+    config.Print()
+
+    device = set_device("gpu" if config.use_cuda else "cpu")
+    fluid.enable_dygraph(device)
+
+    bert_config = BertConfig(config.bert_config_path)
+    bert_config.print_config()
+
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
+
+    def mnli_line_processor(line_id, line):
+        if line_id == "0":
+            return None
+        uid = tokenization.convert_to_unicode(line[0])
+        text_a = tokenization.convert_to_unicode(line[8])
+        text_b = tokenization.convert_to_unicode(line[9])
+        label = tokenization.convert_to_unicode(line[-1])
+        if label not in ["contradiction", "entailment", "neutral"]:
+            label = "contradiction"
+        return BertInputExample(
+            uid=uid, text_a=text_a, text_b=text_b, label=label)
+
+    bert_dataloader = BertDataLoader(
+        "./data/glue_data/MNLI/train.tsv",
+        tokenizer, ["contradiction", "entailment", "neutral"],
+        max_seq_length=64,
+        batch_size=32,
+        line_processor=mnli_line_processor)
+
+    num_train_examples = len(bert_dataloader.dataset)
+    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
+    warmup_steps = int(max_train_steps * config.warmup_proportion)
+
+    print("Trainer count: %d" % trainer_count)
+    print("Num train examples: %d" % num_train_examples)
+    print("Max train steps: %d" % max_train_steps)
+    print("Num warmup steps: %d" % warmup_steps)
+
+    inputs = [
+        Input(
+            [None, None], 'int64', name='src_ids'), Input(
+                [None, None], 'int64', name='pos_ids'), Input(
+                    [None, None], 'int64', name='sent_ids'), Input(
+                        [None, None], 'float32', name='input_mask')
+    ]
+
+    labels = [Input([None, 1], 'int64', name='label')]
+
+    cls_model = ClsModelLayer(
+        config,
+        bert_config,
+        len(["contradiction", "entailment", "neutral"]),
+        is_training=True,
+        return_pooled_out=True)
+
+    optimizer = Optimizer(
+        warmup_steps=warmup_steps,
+        num_train_steps=max_train_steps,
+        learning_rate=config.learning_rate,
+        model_cls=cls_model,
+        weight_decay=config.weight_decay,
+        scheduler=config.lr_scheduler,
+        loss_scaling=config.loss_scaling,
+        parameter_list=cls_model.parameters())
+
+    cls_model.prepare(
+        optimizer,
+        SoftmaxWithCrossEntropy(),
+        Accuracy(topk=(1, 2)),
+        inputs,
+        labels,
+        device=device)
+
+    cls_model.bert_layer.init_parameters(
+        config.init_pretraining_params, verbose=config.verbose)
+
+    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)
+
+    return cls_model
+
+
+if __name__ == '__main__':
+    cls_model = train()
--- a/examples/bert/cls.py
+++ b/examples/bert/cls.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+import six
+import json
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear, Layer
+
+from hapi.text.bert import BertEncoder
+from hapi.model import Model
+
+
+class ClsModelLayer(Model):
+    """
+    classify model
+    """
+
+    def __init__(self,
+                 args,
+                 config,
+                 num_labels,
+                 is_training=True,
+                 return_pooled_out=True,
+                 use_fp16=False):
+        super(ClsModelLayer, self).__init__()
+        self.config = config
+        self.is_training = is_training
+        self.use_fp16 = use_fp16
+        self.loss_scaling = args.loss_scaling
+
+        self.bert_layer = BertEncoder(
+            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+
+        self.cls_fc = Linear(
+            input_dim=self.config["hidden_size"],
+            output_dim=num_labels,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        forward
+        """
+
+        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+                                                     sentence_ids, input_mask)
+
+        cls_feats = fluid.layers.dropout(
+            x=next_sent_feat,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+
+        logits = self.cls_fc(cls_feats)
+
+        return logits
--- a/examples/bert/run_classifier_single_gpu.sh
+++ b/examples/bert/run_classifier_single_gpu.sh
+#!/bin/bash
+BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
+TASK_NAME='MNLI'
+DATA_PATH="./data/glue_data/MNLI/"
+CKPT_PATH="./data/saved_model/mnli_models"
+
+export CUDA_VISIBLE_DEVICES=0
+
+# start fine-tuning
+python3.7 bert_classifier.py\
+    --use_cuda true \
+    --do_train true \
+    --do_test true \
+    --batch_size 64 \
+    --init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
+    --data_dir ${DATA_PATH} \
+    --vocab_path ${BERT_BASE_PATH}/vocab.txt \
+    --checkpoints ${CKPT_PATH} \
+    --save_steps 1000 \
+    --weight_decay  0.01 \
+    --warmup_proportion 0.1 \
+    --validation_steps 100 \
+    --epoch 3 \
+    --max_seq_len 128 \
+    --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
+    --learning_rate 5e-5 \
+    --skip_steps 10 \
+    --shuffle true
+
--- a/examples/bert_leveldb/bert.yaml
+++ b/examples/bert_leveldb/bert.yaml
+bert_config_path: "./config/bert_config.json"
+init_checkpoint: None
+init_pretraining_params: None
+checkpoints: "./saved_model"
+epoch: 3
+learning_rate: 0.0001
+lr_scheduler: "linear_warmup_decay"
+weight_decay: 0.01
+warmup_proportion: 0.1
+save_steps: 100000
+validation_steps: 100000
+loss_scaling: 1.0
+skip_steps: 100
+data_dir: None
+vocab_path: None
+max_seq_len: 512
+batch_size: 32
+in_tokens: False
+do_lower_case: True
+random_seed: 5512
+use_cuda: False
+shuffle: True
+do_train: True
+do_test: True
+use_data_parallel: False
+verbose: False
+
--- a/examples/bert_leveldb/bert_classifier.py
+++ b/examples/bert_leveldb/bert_classifier.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT fine-tuning in Paddle Dygraph Mode."""
+
+import paddle.fluid as fluid
+from hapi.metrics import Accuracy
+from hapi.configure import Config
+from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
+
+from cls import ClsModelLayer
+import hapi.text.tokenizer.tokenization as tokenization
+from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
+
+
+def train():
+
+    config = Config(yaml_file="./bert.yaml")
+    config.build()
+    config.Print()
+
+    device = set_device("gpu" if config.use_cuda else "cpu")
+    fluid.enable_dygraph(device)
+
+    bert_config = BertConfig(config.bert_config_path)
+    bert_config.print_config()
+
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
+
+    def mnli_line_processor(line_id, line):
+        if line_id == "0":
+            return None
+        uid = tokenization.convert_to_unicode(line[0])
+        text_a = tokenization.convert_to_unicode(line[8])
+        text_b = tokenization.convert_to_unicode(line[9])
+        label = tokenization.convert_to_unicode(line[-1])
+        if label not in ["contradiction", "entailment", "neutral"]:
+            label = "contradiction"
+        return BertInputExample(
+            uid=uid, text_a=text_a, text_b=text_b, label=label)
+
+    bert_dataloader = BertDataLoader(
+        "./data/glue_data/MNLI/train.tsv",
+        tokenizer, ["contradiction", "entailment", "neutral"],
+        max_seq_length=64,
+        batch_size=32,
+        line_processor=mnli_line_processor,
+        mode="leveldb")
+
+    num_train_examples = len(bert_dataloader.dataset)
+    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
+    warmup_steps = int(max_train_steps * config.warmup_proportion)
+
+    print("Trainer count: %d" % trainer_count)
+    print("Num train examples: %d" % num_train_examples)
+    print("Max train steps: %d" % max_train_steps)
+    print("Num warmup steps: %d" % warmup_steps)
+
+    inputs = [
+        Input(
+            [None, None], 'int64', name='src_ids'), Input(
+                [None, None], 'int64', name='pos_ids'), Input(
+                    [None, None], 'int64', name='sent_ids'), Input(
+                        [None, None], 'float32', name='input_mask')
+    ]
+
+    labels = [Input([None, 1], 'int64', name='label')]
+
+    cls_model = ClsModelLayer(
+        config,
+        bert_config,
+        len(["contradiction", "entailment", "neutral"]),
+        is_training=True,
+        return_pooled_out=True)
+
+    optimizer = Optimizer(
+        warmup_steps=warmup_steps,
+        num_train_steps=max_train_steps,
+        learning_rate=config.learning_rate,
+        model_cls=cls_model,
+        weight_decay=config.weight_decay,
+        scheduler=config.lr_scheduler,
+        loss_scaling=config.loss_scaling,
+        parameter_list=cls_model.parameters())
+
+    cls_model.prepare(
+        optimizer,
+        SoftmaxWithCrossEntropy(),
+        Accuracy(topk=(1, 2)),
+        inputs,
+        labels,
+        device=device)
+
+    cls_model.bert_layer.init_parameters(
+        config.init_pretraining_params, verbose=config.verbose)
+
+    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)
+
+    return cls_model
+
+
+if __name__ == '__main__':
+    cls_model = train()
--- a/examples/bert_leveldb/cls.py
+++ b/examples/bert_leveldb/cls.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+
+import six
+import json
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear, Layer
+
+from hapi.text.bert import BertEncoder
+from hapi.model import Model
+
+
+class ClsModelLayer(Model):
+    """
+    classify model
+    """
+
+    def __init__(self,
+                 args,
+                 config,
+                 num_labels,
+                 is_training=True,
+                 return_pooled_out=True,
+                 use_fp16=False):
+        super(ClsModelLayer, self).__init__()
+        self.config = config
+        self.is_training = is_training
+        self.use_fp16 = use_fp16
+        self.loss_scaling = args.loss_scaling
+
+        self.bert_layer = BertEncoder(
+            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+
+        self.cls_fc = Linear(
+            input_dim=self.config["hidden_size"],
+            output_dim=num_labels,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        forward
+        """
+
+        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+                                                     sentence_ids, input_mask)
+
+        cls_feats = fluid.layers.dropout(
+            x=next_sent_feat,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+
+        logits = self.cls_fc(cls_feats)
+
+        return logits
--- a/examples/bert_leveldb/nohup.out
+++ b/examples/bert_leveldb/nohup.out
--- a/examples/bert_leveldb/run_classifier_single_gpu.sh
+++ b/examples/bert_leveldb/run_classifier_single_gpu.sh
+#!/bin/bash
+BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
+TASK_NAME='MNLI'
+DATA_PATH="./data/glue_data/MNLI/"
+CKPT_PATH="./data/saved_model/mnli_models"
+
+export CUDA_VISIBLE_DEVICES=7
+
+# start fine-tuning
+python3.7 bert_classifier.py\
+    --use_cuda true \
+    --do_train true \
+    --do_test true \
+    --batch_size 64 \
+    --init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
+    --data_dir ${DATA_PATH} \
+    --vocab_path ${BERT_BASE_PATH}/vocab.txt \
+    --checkpoints ${CKPT_PATH} \
+    --save_steps 1000 \
+    --weight_decay  0.01 \
+    --warmup_proportion 0.1 \
+    --validation_steps 100 \
+    --epoch 3 \
+    --max_seq_len 128 \
+    --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
+    --learning_rate 5e-5 \
+    --skip_steps 10 \
+    --shuffle true
+
--- a/hapi/__init__.py
+++ b/hapi/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from hapi.configure import Config as Config
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+import copy
+
+from hapi.progressbar import ProgressBar
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+
+def config_callbacks(callbacks=None,
+                     model=None,
+                     batch_size=None,
+                     epochs=None,
+                     steps=None,
+                     log_freq=2,
+                     verbose=2,
+                     save_freq=1,
+                     save_dir=None,
+                     metrics=None,
+                     mode='train'):
+    cbks = callbacks or []
+    cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks]
+    if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose:
+        cbks = cbks + [ProgBarLogger(log_freq, verbose=verbose)]
+
+    if not any(isinstance(k, ModelCheckpoint) for k in cbks):
+        cbks = cbks + [ModelCheckpoint(save_freq, save_dir)]
+
+    cbk_list = CallbackList(cbks)
+    cbk_list.set_model(model)
+    metrics = metrics or [] if mode != 'test' else []
+    params = {
+        'batch_size': batch_size,
+        'epochs': epochs,
+        'steps': steps,
+        'verbose': verbose,
+        'metrics': metrics,
+    }
+    cbk_list.set_params(params)
+    return cbk_list
+
+
+class CallbackList(object):
+    def __init__(self, callbacks=None):
+        # copy
+        self.callbacks = [c for c in callbacks]
+        self.params = {}
+        self.model = None
+
+    def append(self, callback):
+        self.callbacks.append(callback)
+
+    def __iter__(self):
+        return iter(self.callbacks)
+
+    def set_params(self, params):
+        for c in self.callbacks:
+            c.set_params(params)
+
+    def set_model(self, model):
+        for c in self.callbacks:
+            c.set_model(model)
+
+    def _call(self, name, *args):
+        for c in self.callbacks:
+            func = getattr(c, name)
+            func(*args)
+
+    def _check_mode(self, mode):
+        assert mode in ['train', 'eval', 'test'], \
+            'mode should be train, eval or test'
+
+    def on_begin(self, mode, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_begin'.format(mode)
+        self._call(name, logs)
+
+    def on_end(self, mode, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_end'.format(mode)
+        self._call(name, logs)
+
+    def on_epoch_begin(self, epoch=None, logs=None):
+        self._call('on_epoch_begin', epoch, logs)
+
+    def on_epoch_end(self, epoch=None, logs=None):
+        self._call('on_epoch_end', epoch, logs)
+
+    def on_batch_begin(self, mode, step=None, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_batch_begin'.format(mode)
+        self._call(name, step, logs)
+
+    def on_batch_end(self, mode, step=None, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_batch_end'.format(mode)
+        self._call(name, step, logs)
+
+
+class Callback(object):
+    def __init__(self):
+        self.model = None
+        self.params = {}
+
+    def set_params(self, params):
+        self.params = params
+
+    def set_model(self, model):
+        self.model = model
+
+    def on_train_begin(self, logs=None):
+        """
+        """
+
+    def on_train_end(self, logs=None):
+        """
+        """
+
+    def on_eval_begin(self, logs=None):
+        """
+        """
+
+    def on_eval_end(self, logs=None):
+        """
+        """
+
+    def on_test_begin(self, logs=None):
+        """
+        """
+
+    def on_test_end(self, logs=None):
+        """
+        """
+
+    def on_epoch_begin(self, epoch, logs=None):
+        """
+        """
+
+    def on_epoch_end(self, epoch, logs=None):
+        """
+        """
+
+    def on_train_batch_begin(self, step, logs=None):
+        """
+        """
+
+    def on_train_batch_end(self, step, logs=None):
+        """
+        """
+
+    def on_eval_batch_begin(self, step, logs=None):
+        """
+        """
+
+    def on_eval_batch_end(self, step, logs=None):
+        """
+        """
+
+    def on_eval_batch_begin(self, step, logs=None):
+        """
+        """
+
+    def on_eval_batch_end(self, step, logs=None):
+        """
+        """
+
+
+class ProgBarLogger(Callback):
+    def __init__(self, log_freq=1, verbose=2):
+        self.epochs = None
+        self.steps = None
+        self.progbar = None
+        self.verbose = verbose
+        self.log_freq = log_freq
+
+    def on_train_begin(self, logs=None):
+        self.epochs = self.params['epochs']
+        assert self.epochs
+        self.train_metrics = self.params['metrics']
+        assert self.train_metrics
+
+    def on_epoch_begin(self, epoch=None, logs=None):
+        self.steps = self.params['steps']
+        self.epoch = epoch
+        self.train_step = 0
+        if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
+            print('Epoch %d/%d' % (epoch + 1, self.epochs))
+        self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
+
+    def _updates(self, logs, mode):
+        values = []
+        metrics = getattr(self, '%s_metrics' % (mode))
+        progbar = getattr(self, '%s_progbar' % (mode))
+        steps = getattr(self, '%s_step' % (mode))
+        for k in metrics:
+            if k in logs:
+                values.append((k, logs[k]))
+        progbar.update(steps, values)
+
+    def on_train_batch_end(self, step, logs=None):
+        logs = logs or {}
+        self.train_step += 1
+
+        if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
+            # if steps is not None, last step will update in on_epoch_end
+            if self.steps and self.train_step < self.steps:
+                self._updates(logs, 'train')
+            else:
+                self._updates(logs, 'train')
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        if self.verbose and ParallelEnv().local_rank == 0:
+            self._updates(logs, 'train')
+
+    def on_eval_begin(self, logs=None):
+        self.eval_steps = logs.get('steps', None)
+        self.eval_metrics = logs.get('metrics_name', [])
+        self.eval_step = 0
+        self.evaled_samples = 0
+        self.eval_progbar = ProgressBar(
+            num=self.eval_steps, verbose=self.verbose)
+        if ParallelEnv().local_rank == 0:
+            print('Eval begin...')
+
+    def on_eval_batch_end(self, step, logs=None):
+        logs = logs or {}
+        self.eval_step = step
+        samples = logs.get('batch_size', 1)
+        self.evaled_samples += samples
+
+        if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
+            # if steps is not None, last step will update in on_epoch_end
+            if self.eval_steps and self.eval_step < self.eval_steps:
+                self._updates(logs, 'eval')
+
+    def on_eval_end(self, logs=None):
+        logs = logs or {}
+        if self.verbose and ParallelEnv().local_rank == 0:
+            self._updates(logs, 'eval')
+            print('Eval samples: %d' % (self.evaled_samples))
+
+
+class ModelCheckpoint(Callback):
+    def __init__(self, save_freq=1, save_dir=None):
+        self.save_freq = save_freq
+        self.save_dir = save_dir
+
+    def on_epoch_begin(self, epoch=None, logs=None):
+        self.epoch = epoch
+
+    def _is_save(self):
+        return self.model and self.save_dir and ParallelEnv().local_rank == 0
+
+    def on_epoch_end(self, epoch, logs=None):
+        if self._is_save() and self.epoch % self.save_freq == 0:
+            path = '{}/{}'.format(self.save_dir, epoch)
+            print('save checkpoint at {}'.format(path))
+            self.model.save(path)
+
+    def on_train_end(self, logs=None):
+        if self._is_save():
+            path = '{}/final'.format(self.save_dir)
+            print('save checkpoint at {}'.format(path))
+            self.model.save(path)
--- a/hapi/configure.py
+++ b/hapi/configure.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import argparse
+import json
+import yaml
+import six
+import logging
+
+logging_only_message = "%(message)s"
+logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"
+
+
+class JsonConfig(object):
+    """
+    A high-level api for handling json configure file.
+    """
+
+    def __init__(self, config_path):
+        self._config_dict = self._parse(config_path)
+
+    def _parse(self, config_path):
+        try:
+            with open(config_path) as json_file:
+                config_dict = json.load(json_file)
+        except:
+            raise IOError("Error in parsing bert model config file '%s'" %
+                          config_path)
+        else:
+            return config_dict
+
+    def __getitem__(self, key):
+        return self._config_dict[key]
+
+    def print_config(self):
+        for arg, value in sorted(six.iteritems(self._config_dict)):
+            print('%s: %s' % (arg, value))
+        print('------------------------------------------------')
+
+
+class ArgumentGroup(object):
+    def __init__(self, parser, title, des):
+        self._group = parser.add_argument_group(title=title, description=des)
+
+    def add_arg(self, name, type, default, help, **kwargs):
+        type = str2bool if type == bool else type
+        self._group.add_argument(
+            "--" + name,
+            default=default,
+            type=type,
+            help=help + ' Default: %(default)s.',
+            **kwargs)
+
+
+class ArgConfig(object):
+    """
+    A high-level api for handling argument configs.
+    """
+
+    def __init__(self):
+        parser = argparse.ArgumentParser()
+
+        custom_g = ArgumentGroup(parser, "customize", "customized options.")
+
+        self.custom_g = custom_g
+
+        self.parser = parser
+
+    def add_arg(self, name, dtype, default, descrip):
+        self.custom_g.add_arg(name, dtype, default, descrip)
+
+    def build_conf(self):
+        return self.parser.parse_args()
+
+
+def str2bool(v):
+    # because argparse does not support to parse "true, False" as python
+    # boolean directly
+    return v.lower() in ("true", "t", "1")
+
+
+def print_arguments(args, log=None):
+    if not log:
+        print('-----------  Configuration Arguments -----------')
+        for arg, value in sorted(six.iteritems(vars(args))):
+            print('%s: %s' % (arg, value))
+        print('------------------------------------------------')
+    else:
+        log.info('-----------  Configuration Arguments -----------')
+        for arg, value in sorted(six.iteritems(vars(args))):
+            log.info('%s: %s' % (arg, value))
+        log.info('------------------------------------------------')
+
+
+class Config(object):
+    """
+    A high-level API for managing configuration files in PaddlePaddle.
+    Can jointly work with command-line-arugment, json files and yaml files.
+    """
+
+    def __init__(self, json_file="", yaml_file="", fuse_args=True):
+        """
+            Init funciton for PDConfig.
+            json_file: the path to the json configure file.
+            yaml_file: the path to the yaml configure file.
+            fuse_args: if fuse the json/yaml configs with argparse.
+        """
+        assert isinstance(json_file, str)
+        assert isinstance(yaml_file, str)
+
+        if json_file != "" and yaml_file != "":
+            raise Warning(
+                "json_file and yaml_file can not co-exist for now. please only use one configure file type."
+            )
+            return
+
+        self.args = None
+        self.arg_config = {}
+        self.json_config = {}
+        self.yaml_config = {}
+
+        parser = argparse.ArgumentParser()
+
+        self.default_g = ArgumentGroup(parser, "default", "default options.")
+        self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
+        self.json_g = ArgumentGroup(parser, "json", "options from json.")
+        self.com_g = ArgumentGroup(parser, "custom", "customized options.")
+
+        self.parser = parser
+
+        if json_file != "":
+            self.load_json(json_file, fuse_args=fuse_args)
+
+        if yaml_file:
+            self.load_yaml(yaml_file, fuse_args=fuse_args)
+
+    def load_json(self, file_path, fuse_args=True):
+
+        if not os.path.exists(file_path):
+            raise Warning("the json file %s does not exist." % file_path)
+            return
+
+        with open(file_path, "r") as fin:
+            self.json_config = json.loads(fin.read())
+            fin.close()
+
+        if fuse_args:
+            for name in self.json_config:
+                if isinstance(self.json_config[name], list):
+                    self.json_g.add_arg(
+                        name,
+                        type(self.json_config[name][0]),
+                        self.json_config[name],
+                        "This is from %s" % file_path,
+                        nargs=len(self.json_config[name]))
+                    continue
+                if not isinstance(self.json_config[name], int) \
+                    and not isinstance(self.json_config[name], float) \
+                    and not isinstance(self.json_config[name], str) \
+                    and not isinstance(self.json_config[name], bool):
+
+                    continue
+
+                self.json_g.add_arg(name,
+                                    type(self.json_config[name]),
+                                    self.json_config[name],
+                                    "This is from %s" % file_path)
+
+    def load_yaml(self, file_path, fuse_args=True):
+
+        if not os.path.exists(file_path):
+            raise Warning("the yaml file %s does not exist." % file_path)
+            return
+
+        with open(file_path, "r") as fin:
+            self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
+            fin.close()
+
+        if fuse_args:
+            for name in self.yaml_config:
+                if isinstance(self.yaml_config[name], list):
+                    self.yaml_g.add_arg(
+                        name,
+                        type(self.yaml_config[name][0]),
+                        self.yaml_config[name],
+                        "This is from %s" % file_path,
+                        nargs=len(self.yaml_config[name]))
+                    continue
+
+                if not isinstance(self.yaml_config[name], int) \
+                    and not isinstance(self.yaml_config[name], float) \
+                    and not isinstance(self.yaml_config[name], str) \
+                    and not isinstance(self.yaml_config[name], bool):
+
+                    continue
+
+                self.yaml_g.add_arg(name,
+                                    type(self.yaml_config[name]),
+                                    self.yaml_config[name],
+                                    "This is from %s" % file_path)
+
+    def build(self):
+        self.args = self.parser.parse_args()
+        self.arg_config = vars(self.args)
+
+    def __add__(self, new_arg):
+        assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
+        assert len(new_arg) >= 3
+        assert self.args is None
+
+        name = new_arg[0]
+        dtype = new_arg[1]
+        dvalue = new_arg[2]
+        desc = new_arg[3] if len(
+            new_arg) == 4 else "Description is not provided."
+
+        self.com_g.add_arg(name, dtype, dvalue, desc)
+
+        return self
+
+    def __getattr__(self, name):
+        if name in self.arg_config:
+            return self.arg_config[name]
+
+        if name in self.json_config:
+            return self.json_config[name]
+
+        if name in self.yaml_config:
+            return self.yaml_config[name]
+
+        raise Warning("The argument %s is not defined." % name)
+
+    def Print(self):
+
+        print("-" * 70)
+        for name in self.arg_config:
+            print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
+
+        for name in self.json_config:
+            if name not in self.arg_config:
+                print("%s:\t\t\t\t%s" %
+                      (str(name), str(self.json_config[name])))
+
+        for name in self.yaml_config:
+            if name not in self.arg_config:
+                print("%s:\t\t\t\t%s" %
+                      (str(name), str(self.yaml_config[name])))
+
+        print("-" * 70)
+
+
+if __name__ == "__main__":
+    """
+    pd_config = PDConfig(json_file = "./test/bert_config.json")
+    pd_config.build()
+
+    print(pd_config.do_train)
+    print(pd_config.hidden_size)
+
+    pd_config = PDConfig(yaml_file = "./test/bert_config.yaml")
+    pd_config.build()
+
+    print(pd_config.do_train)
+    print(pd_config.hidden_size)
+    """
+
+    config = Config(yaml_file="./bert.yaml")
+    config += ("my_age", int, 18, "I am forever 18.")
+    config.build()
+
+    print(config.data_dir)
+    print(config.my_age)
--- a/hapi/distributed.py
+++ b/hapi/distributed.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import six
+import time
+import math
+import socket
+import contextlib
+import numpy as np
+
+from paddle import fluid
+from paddle.fluid.layers import collective
+from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
+from paddle.fluid.io import BatchSampler
+
+_parallel_context_initialized = False
+
+
+class DistributedBatchSampler(BatchSampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+
+    In such case, each process can pass a DistributedBatchSampler instance 
+    as a DataLoader sampler, and load a subset of the original dataset that 
+    is exclusive to it.
+
+    .. note::
+        Dataset is assumed to be of constant size.
+        
+    Args:
+        data_source: this could be a `fluid.io.Dataset` implement
+                     or other python object which implemented
+                     `__len__` for BatchSampler to get sample
+                     number of data source.
+        batch_size(int): sample indice number in a mini-batch indices.
+        shuffle(bool): whther to shuffle indices order before genrating
+            batch indices. Default False.
+        drop_last(bool): whether drop the last incomplete batch dataset size
+            is not divisible by the batch size. Default False
+    """
+
+    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
+        self.dataset = dataset
+
+        assert isinstance(batch_size, int) and batch_size > 0, \
+                "batch_size should be a positive integer"
+        self.batch_size = batch_size
+        assert isinstance(shuffle, bool), \
+                "shuffle should be a boolean value"
+        self.shuffle = shuffle
+        assert isinstance(drop_last, bool), \
+                "drop_last should be a boolean number"
+
+        self.drop_last = drop_last
+        self.nranks = ParallelEnv().nranks
+        self.local_rank = ParallelEnv().local_rank
+        self.epoch = 0
+        self.num_samples = int(
+            math.ceil(len(self.dataset) * 1.0 / self.nranks))
+        self.total_size = self.num_samples * self.nranks
+
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        if self.shuffle:
+            np.random.RandomState(self.epoch).shuffle(indices)
+            self.epoch += 1
+
+        # subsample
+        def _get_indices_by_batch_size(indices):
+            subsampled_indices = []
+            last_batch_size = self.total_size % (self.batch_size * self.nranks)
+            assert last_batch_size % self.nranks == 0
+            last_local_batch_size = last_batch_size // self.nranks
+
+            for i in range(self.local_rank * self.batch_size,
+                           len(indices) - last_batch_size,
+                           self.batch_size * self.nranks):
+                subsampled_indices.extend(indices[i:i + self.batch_size])
+
+            indices = indices[len(indices) - last_batch_size:]
+            subsampled_indices.extend(indices[
+                self.local_rank * last_local_batch_size:(
+                    self.local_rank + 1) * last_local_batch_size])
+            return subsampled_indices
+
+        if self.nranks > 1:
+            indices = _get_indices_by_batch_size(indices)
+
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
+
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+
+    def __len__(self):
+        num_samples = self.num_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+
+def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
+    return collective._c_allgather(
+        x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
+
+
+def wait_server_ready(endpoints):
+    assert not isinstance(endpoints, six.string_types)
+    while True:
+        all_ok = True
+        not_ready_endpoints = []
+        for ep in endpoints:
+            ip_port = ep.split(":")
+            with contextlib.closing(
+                    socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+                sock.settimeout(2)
+                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                if result != 0:
+                    all_ok = False
+                    not_ready_endpoints.append(ep)
+        if not all_ok:
+            time.sleep(3)
+        else:
+            break
+
+
+def init_communicator(program, rank, nranks, wait_port, current_endpoint,
+                      endpoints):
+    if nranks < 2:
+        return
+    other_endpoints = endpoints[:]
+    other_endpoints.remove(current_endpoint)
+    if rank == 0 and wait_port:
+        wait_server_ready(other_endpoints)
+    block = program.global_block()
+    nccl_id_var = block.create_var(
+        name=fluid.unique_name.generate('nccl_id'),
+        persistable=True,
+        type=fluid.core.VarDesc.VarType.RAW)
+
+    block.append_op(
+        type='c_gen_nccl_id',
+        inputs={},
+        outputs={'Out': nccl_id_var},
+        attrs={
+            'rank': rank,
+            'endpoint': current_endpoint,
+            'other_endpoints': other_endpoints
+        })
+
+    block.append_op(
+        type='c_comm_init',
+        inputs={'X': nccl_id_var},
+        outputs={},
+        attrs={
+            'nranks': nranks,
+            'rank': rank,
+            'ring_id': 0,
+        })
+
+
+def prepare_distributed_context(place=None):
+    if place is None:
+        place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
+            else fluid.CUDAPlace(0)
+
+    strategy = ParallelStrategy()
+    strategy.nranks = ParallelEnv().nranks
+    strategy.local_rank = ParallelEnv().local_rank
+    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
+    strategy.current_endpoint = ParallelEnv().current_endpoint
+
+    if strategy.nranks < 2:
+        return
+
+    global _parallel_context_initialized
+
+    if not _parallel_context_initialized and isinstance(place,
+                                                        fluid.CUDAPlace):
+
+        def _init_context():
+            communicator_prog = fluid.Program()
+            init_communicator(communicator_prog, strategy.local_rank,
+                              strategy.nranks, True, strategy.current_endpoint,
+                              strategy.trainer_endpoints)
+            exe = fluid.Executor(place)
+            exe.run(communicator_prog)
+
+        if fluid.in_dygraph_mode():
+            fluid.disable_dygraph()
+            _init_context()
+            fluid.enable_dygraph(place)
+        else:
+            _init_context()
+
+    else:
+        assert ("Only support CUDAPlace for now.")
+
+    _parallel_context_initialized = True
+    return strategy
--- a/hapi/metrics.py
+++ b/hapi/metrics.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+
+import six
+import abc
+import numpy as np
+import paddle.fluid as fluid
+
+import logging
+FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+logger = logging.getLogger(__name__)
+
+__all__ = ['Metric', 'Accuracy']
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Metric(object):
+    """
+    Base class for metric, encapsulates metric logic and APIs
+
+    Usage:
+    m = SomeMetric()
+    for prediction, label in ...:
+        m.update(prediction, label)
+    m.accumulate()
+    """
+
+    @abc.abstractmethod
+    def reset(self):
+        """
+        Reset states and result
+        """
+        raise NotImplementedError("function 'reset' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    @abc.abstractmethod
+    def update(self, *args, **kwargs):
+        """
+        Update states for metric
+        """
+        raise NotImplementedError("function 'update' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    @abc.abstractmethod
+    def accumulate(self):
+        """
+        Accumulates statistics, computes and returns the metric value
+        """
+        raise NotImplementedError(
+            "function 'accumulate' not implemented in {}.".format(
+                self.__class__.__name__))
+
+    @abc.abstractmethod
+    def name(self):
+        """
+        Returns metric name
+        """
+        raise NotImplementedError("function 'name' not implemented in {}.".
+                                  format(self.__class__.__name__))
+
+    def add_metric_op(self, pred, label):
+        """
+        Add process op for metric in program
+        """
+        return pred, label
+
+
+class Accuracy(Metric):
+    """
+    Encapsulates accuracy metric logic
+    """
+
+    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
+        super(Accuracy, self).__init__(*args, **kwargs)
+        self.topk = topk
+        self.maxk = max(topk)
+        self._init_name(name)
+        self.reset()
+
+    def add_metric_op(self, pred, label, *args, **kwargs):
+        pred = fluid.layers.argsort(pred[0], descending=True)[1][:, :self.maxk]
+        correct = pred == label[0]
+        return correct
+
+    def update(self, correct, *args, **kwargs):
+        accs = []
+        for i, k in enumerate(self.topk):
+            num_corrects = correct[:, :k].sum()
+            num_samples = len(correct)
+            accs.append(float(num_corrects) / num_samples)
+            self.total[i] += num_corrects
+            self.count[i] += num_samples
+        return accs
+
+    def reset(self):
+        self.total = [0.] * len(self.topk)
+        self.count = [0] * len(self.topk)
+
+    def accumulate(self):
+        res = []
+        for t, c in zip(self.total, self.count):
+            res.append(float(t) / c)
+        return res
+
+    def _init_name(self, name):
+        name = name or 'acc'
+        if self.maxk != 1:
+            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
+        else:
+            self._name = ['acc']
+
+    def name(self):
+        return self._name
--- a/hapi/model.py
+++ b/hapi/model.py
--- a/hapi/progressbar.py
+++ b/hapi/progressbar.py
+import sys
+import time
+import numpy as np
+
+
+class ProgressBar(object):
+    """progress bar """
+
+    def __init__(self,
+                 num=None,
+                 width=30,
+                 verbose=1,
+                 start=True,
+                 file=sys.stdout):
+        self._num = num
+        if isinstance(num, int) and num <= 0:
+            raise TypeError('num should be None or integer (> 0)')
+        max_width = self._get_max_width()
+        self._width = width if width <= max_width else max_width
+        self._total_width = 0
+        self._verbose = verbose
+        self.file = file
+        self._values = {}
+        self._values_order = []
+        if start:
+            self._start = time.time()
+        self._last_update = 0
+
+        self._dynamic_display = (
+            (hasattr(self.file, 'isatty') and
+             self.file.isatty()) or 'ipykernel' in sys.modules or
+            'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ)
+
+    def _get_max_width(self):
+        if sys.version_info > (3, 3):
+            from shutil import get_terminal_size
+        else:
+            from backports.shutil_get_terminal_size import get_terminal_size
+        terminal_width, _ = get_terminal_size()
+        max_width = min(int(terminal_width * 0.6), terminal_width - 50)
+        return max_width
+
+    def start(self):
+        self.file.flush()
+        self._start = time.time()
+
+    def update(self, current_num, values=None):
+        now = time.time()
+
+        if current_num:
+            time_per_unit = (now - self._start) / current_num
+        else:
+            time_per_unit = 0
+
+        if time_per_unit >= 1 or time_per_unit == 0:
+            fps = ' - %.0fs/%s' % (time_per_unit, 'step')
+        elif time_per_unit >= 1e-3:
+            fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step')
+        else:
+            fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step')
+
+        info = ''
+        if self._verbose == 1:
+            prev_total_width = self._total_width
+
+            if self._dynamic_display:
+                sys.stdout.write('\b' * prev_total_width)
+                sys.stdout.write('\r')
+            else:
+                sys.stdout.write('\n')
+
+            if self._num is not None:
+                numdigits = int(np.log10(self._num)) + 1
+
+                bar_chars = ('step %' + str(numdigits) + 'd/%d [') % (
+                    current_num, self._num)
+                prog = float(current_num) / self._num
+                prog_width = int(self._width * prog)
+
+                if prog_width > 0:
+                    bar_chars += ('=' * (prog_width - 1))
+                    if current_num < self._num:
+                        bar_chars += '>'
+                    else:
+                        bar_chars += '='
+                bar_chars += ('.' * (self._width - prog_width))
+                bar_chars += ']'
+            else:
+                bar_chars = 'step %3d' % current_num
+
+            self._total_width = len(bar_chars)
+            sys.stdout.write(bar_chars)
+
+            for k, val in values:
+                info += ' - %s:' % k
+                val = val if isinstance(val, list) else [val]
+                for i, v in enumerate(val):
+                    if isinstance(v, (float, np.float32, np.float64)):
+                        if abs(v) > 1e-3:
+                            info += ' %.4f' % v
+                        else:
+                            info += ' %.4e' % v
+                    else:
+                        info += ' %s' % v
+
+            if self._num is not None and current_num < self._num:
+                eta = time_per_unit * (self._num - current_num)
+                if eta > 3600:
+                    eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) //
+                                                   60, eta % 60)
+                elif eta > 60:
+                    eta_format = '%d:%02d' % (eta // 60, eta % 60)
+                else:
+                    eta_format = '%ds' % eta
+
+                info += ' - ETA: %s' % eta_format
+
+            info += fps
+            self._total_width += len(info)
+            if prev_total_width > self._total_width:
+                info += (' ' * (prev_total_width - self._total_width))
+
+            # newline for another epoch
+            if self._num is not None and current_num >= self._num:
+                info += '\n'
+            if self._num is None:
+                info += '\n'
+
+            sys.stdout.write(info)
+            sys.stdout.flush()
+            self._last_update = now
+        elif self._verbose == 2:
+            if self._num:
+                numdigits = int(np.log10(self._num)) + 1
+                count = ('step %' + str(numdigits) + 'd/%d') % (current_num,
+                                                                self._num)
+            else:
+                count = 'step %3d' % current_num
+            info = count + info
+
+            for k, val in values:
+                info += ' - %s:' % k
+                val = val if isinstance(val, list) else [val]
+                for v in val:
+                    if isinstance(v, (float, np.float32, np.float64)):
+                        if abs(v) > 1e-3:
+                            info += ' %.4f' % v
+                        else:
+                            info += ' %.4e' % v
+                    elif isinstance(v, np.ndarray) and \
+                        v.size == 1 and \
+                        isinstance(v.dtype, (np.float32, np.float64)):
+                        if abs(v[0]) > 1e-3:
+                            info += ' %.4f' % v[0]
+                        else:
+                            info += ' %.4e' % v[0]
+                    else:
+                        info += ' %s' % v
+
+            info += fps
+            info += '\n'
+            sys.stdout.write(info)
+            sys.stdout.flush()
--- a/hapi/text/__init__.py
+++ b/hapi/text/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from hapi.text.text import RNNCell as RNNCell
+from hapi.text.text import BasicLSTMCell as BasicLSTMCell
+from hapi.text.text import BasicGRUCell as BasicGRUCell
+from hapi.text.text import RNN as RNN
+from hapi.text.text import DynamicDecode as DynamicDecode
+from hapi.text.text import BeamSearchDecoder as BeamSearchDecoder
+from hapi.text.text import MultiHeadAttention as MultiHeadAttention
+from hapi.text.text import FFN as FFN
+from hapi.text.text import TransformerEncoderLayer as TransformerEncoderLayer
+from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer
+from hapi.text.text import TransformerEncoder as TransformerEncoder
+from hapi.text.text import TransformerDecoder as TransformerDecoder
+from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder
+from hapi.text.text import DynamicGRU as DynamicGRU
+from hapi.text.text import BiGRU as BiGRU
+from hapi.text.text import Linear_chain_crf as Linear_chain_crf
+from hapi.text.text import Crf_decoding as Crf_decoding
+from hapi.text.text import SequenceTagging as SequenceTagging
--- a/hapi/text/bert/__init__.py
+++ b/hapi/text/bert/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from hapi.text.bert.bert import BertConfig as BertConfig
+from hapi.text.bert.optimization import Optimizer as Optimizer
+from hapi.text.bert.dataloader import BertDataLoader as BertDataLoader
+from hapi.text.bert.dataloader import BertInputExample as BertInputExample
+from hapi.text.tokenizer import tokenization as tokenization
+from hapi.text.bert.bert import BertEncoder as BertEncoder
--- a/hapi/text/bert/batching.py
+++ b/hapi/text/bert/batching.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+    """
+    Add mask for batch_tokens, return out, mask_label, mask_pos;
+    Note: mask_pos responding the batch_tokens after padded;
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    pre_sent_len = 0
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        mask_flag = False
+        prob_index += pre_sent_len
+        for token_index, token in enumerate(sent):
+            prob = prob_mask[prob_index + token_index]
+            if prob > 0.15:
+                continue
+            elif 0.03 < prob <= 0.15:
+                # mask
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = MASK
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            elif 0.015 < prob <= 0.03:
+                # random replace
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = replace_ids[prob_index + token_index]
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            else:
+                # keep the original token
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    mask_pos.append(sent_index * max_len + token_index)
+        pre_sent_len = len(sent)
+
+        # ensure at least mask one word in a sentence
+        while not mask_flag:
+            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
+            if sent[token_index] != SEP and sent[token_index] != CLS:
+                mask_label.append(sent[token_index])
+                sent[token_index] = MASK
+                mask_flag = True
+                mask_pos.append(sent_index * max_len + token_index)
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    return batch_tokens, mask_label, mask_pos
+
+
+def prepare_batch_data(insts,
+                       total_token_num,
+                       voc_size=0,
+                       pad_id=None,
+                       cls_id=None,
+                       sep_id=None,
+                       mask_id=None,
+                       return_input_mask=True,
+                       return_max_len=True,
+                       return_num_token=False):
+    """
+    1. generate Tensor of data
+    2. generate Tensor of position
+    3. generate self attention mask, [shape: batch_size *  max_len * max_len]
+    """
+
+    batch_src_ids = [inst[0] for inst in insts]
+    batch_pos_ids = [inst[1] for inst in insts]
+    batch_sent_ids = [inst[2] for inst in insts]
+    labels_list = []
+    # compatible with squad, whose example includes start/end positions, 
+    # or unique id
+
+    for i in range(3, len(insts[0]), 1):
+        labels = [inst[i] for inst in insts]
+        labels = np.array(labels).astype("int64").reshape([-1, 1])
+        labels_list.append(labels)
+
+    # First step: do mask without padding
+    if mask_id >= 0:
+        out, mask_label, mask_pos = mask(
+            batch_src_ids,
+            total_token_num,
+            vocab_size=voc_size,
+            CLS=cls_id,
+            SEP=sep_id,
+            MASK=mask_id)
+    else:
+        out = batch_src_ids
+    # Second step: padding
+    src_id, self_input_mask = pad_batch_data(
+        out, pad_idx=pad_id, return_input_mask=True)
+    pos_id = pad_batch_data(
+        batch_pos_ids,
+        pad_idx=pad_id,
+        return_pos=False,
+        return_input_mask=False)
+    sent_id = pad_batch_data(
+        batch_sent_ids,
+        pad_idx=pad_id,
+        return_pos=False,
+        return_input_mask=False)
+
+    if mask_id >= 0:
+        return_list = [
+            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
+        ] + labels_list
+    else:
+        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
+
+    return return_list if len(return_list) > 1 else return_list[0]
+
+
+def pad_batch_data(insts,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and input mask.
+    """
+    return_list = []
+    max_len = max(len(inst) for inst in insts)
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+
+    inst_data = np.array([
+        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
+    ])
+    return_list += [inst_data.astype("int64").reshape([-1, max_len])]
+
+    # position data
+    if return_pos:
+        inst_pos = np.array([
+            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+            for inst in insts
+        ])
+
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
+
+    if return_input_mask:
+        # This is used to avoid attention on paddings.
+        input_mask_data = np.array([[1] * len(inst) + [0] *
+                                    (max_len - len(inst)) for inst in insts])
+        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+        return_list += [input_mask_data.astype("float32")]
+
+    if return_max_len:
+        return_list += [max_len]
+
+    if return_num_token:
+        num_token = 0
+        for inst in insts:
+            num_token += len(inst)
+        return_list += [num_token]
+
+    return return_list if len(return_list) > 1 else return_list[0]
+
+
+if __name__ == "__main__":
+    pass
--- a/hapi/text/bert/bert.py
+++ b/hapi/text/bert/bert.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"bert"
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import json
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
+
+from hapi.text.text import PrePostProcessLayer, TransformerEncoder
+from hapi.text.bert.utils.init import init_from_static_model
+
+
+class BertConfig(object):
+    def __init__(self, config_path):
+        self._config_dict = self._parse(config_path)
+
+    def _parse(self, config_path):
+        try:
+            with open(config_path) as json_file:
+                config_dict = json.load(json_file)
+        except Exception:
+            raise IOError("Error in parsing bert model config file '%s'" %
+                          config_path)
+        else:
+            return config_dict
+
+    def __getitem__(self, key):
+        return self._config_dict[key]
+
+    def print_config(self):
+        for arg, value in sorted(six.iteritems(self._config_dict)):
+            print('%s: %s' % (arg, value))
+        print('------------------------------------------------')
+
+
+class BertEncoder(Layer):
+    """
+    bert
+    """
+
+    def __init__(self, config, return_pooled_out=True, use_fp16=False):
+        super(BertEncoder, self).__init__()
+
+        self.config = config
+        self._emb_size = config['hidden_size']
+        self._n_layer = config['num_hidden_layers']
+        self._n_head = config['num_attention_heads']
+        self._voc_size = config['vocab_size']
+        self._max_position_seq_len = config['max_position_embeddings']
+        self._sent_types = config['type_vocab_size']
+        self._hidden_act = config['hidden_act']
+        self._prepostprocess_dropout = config['hidden_dropout_prob']
+        self._attention_dropout = config['attention_probs_dropout_prob']
+        self.return_pooled_out = return_pooled_out
+
+        self._word_emb_name = "word_embedding"
+        self._pos_emb_name = "pos_embedding"
+        self._sent_emb_name = "sent_embedding"
+        self._dtype = "float16" if use_fp16 else "float32"
+
+        self._param_initializer = fluid.initializer.TruncatedNormal(
+            scale=config['initializer_range'])
+
+        self._src_emb = Embedding(
+            size=[self._voc_size, self._emb_size],
+            param_attr=fluid.ParamAttr(
+                name=self._word_emb_name, initializer=self._param_initializer),
+            dtype=self._dtype)
+
+        self._pos_emb = Embedding(
+            size=[self._max_position_seq_len, self._emb_size],
+            param_attr=fluid.ParamAttr(
+                name=self._pos_emb_name, initializer=self._param_initializer),
+            dtype=self._dtype)
+
+        self._sent_emb = Embedding(
+            size=[self._sent_types, self._emb_size],
+            param_attr=fluid.ParamAttr(
+                name=self._sent_emb_name, initializer=self._param_initializer),
+            dtype=self._dtype)
+
+        self.pooled_fc = Linear(
+            input_dim=self._emb_size,
+            output_dim=self._emb_size,
+            param_attr=fluid.ParamAttr(
+                name="pooled_fc.w_0", initializer=self._param_initializer),
+            bias_attr="pooled_fc.b_0",
+            act="tanh")
+
+        self.pre_process_layer = PrePostProcessLayer(
+            "nd", self._emb_size, self._prepostprocess_dropout, None)
+
+        self._encoder = TransformerEncoder(
+            n_layer=self._n_layer,
+            n_head=self._n_head,
+            d_key=self._emb_size // self._n_head,
+            d_value=self._emb_size // self._n_head,
+            d_model=self._emb_size,
+            d_inner_hid=self._emb_size * 4,
+            prepostprocess_dropout=self._prepostprocess_dropout,
+            attention_dropout=self._attention_dropout,
+            relu_dropout=0,
+            preprocess_cmd="",
+            postprocess_cmd="dan",
+            ffn_fc1_act=self._hidden_act)
+
+    def init_parameters(self, param_path="", verbose=False):
+        init_from_static_model(param_path, self, self.config, verbose)
+
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        forward
+        """
+        src_emb = self._src_emb(src_ids)
+        pos_emb = self._pos_emb(position_ids)
+        sent_emb = self._sent_emb(sentence_ids)
+
+        emb_out = src_emb + pos_emb
+        emb_out = emb_out + sent_emb
+
+        emb_out = self.pre_process_layer(emb_out)
+
+        self_attn_mask = fluid.layers.matmul(
+            x=input_mask, y=input_mask, transpose_y=True)
+        self_attn_mask = fluid.layers.scale(
+            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
+        n_head_self_attn_mask = fluid.layers.stack(
+            x=[self_attn_mask] * self._n_head, axis=1)
+        n_head_self_attn_mask.stop_gradient = True
+
+        enc_output = self._encoder(emb_out, n_head_self_attn_mask)
+
+        if not self.return_pooled_out:
+            return enc_output
+        next_sent_feat = fluid.layers.slice(
+            input=enc_output, axes=[1], starts=[0], ends=[1])
+        next_sent_feat = self.pooled_fc(next_sent_feat)
+        next_sent_feat = fluid.layers.reshape(
+            next_sent_feat, shape=[-1, self._emb_size])
+
+        return enc_output, next_sent_feat
--- a/hapi/text/bert/data_processor.py
+++ b/hapi/text/bert/data_processor.py
--- a/hapi/text/bert/dataloader.py
+++ b/hapi/text/bert/dataloader.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import os
+import six
+import csv
+import glob
+import tarfile
+import itertools
+import leveldb
+from functools import partial
+
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+from hapi.distributed import DistributedBatchSampler
+from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
+from hapi.text.bert.batching import prepare_batch_data
+import hapi.text.tokenizer.tokenization as tokenization
+
+__all__ = [
+    'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
+    'SentencePairDataset', 'BertDataLoader'
+]
+
+
+class BertInputExample(object):
+    def __init__(self, uid, text_a, text_b=None, label=None):
+        self.uid = uid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
+class BertInputFeatures(object):
+    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+        self.input_ids = input_ids
+        self.pos_ids = list(range(len(self.input_ids)))
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+def convert_single_example_to_unicode(guid, single_example):
+    text_a = tokenization.convert_to_unicode(single_example[0])
+    text_b = tokenization.convert_to_unicode(single_example[1])
+    label = tokenization.convert_to_unicode(single_example[2])
+    return BertInputExample(uid=uid, text_a=text_a, text_b=text_b, label=label)
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer):
+    """Converts a single `BertInputExample` into a single `BertInputFeatures`."""
+    label_map = {}
+    for (i, label) in enumerate(label_list):
+        label_map[label] = i
+
+    tokens_a = tokenizer.tokenize(example.text_a)
+    tokens_b = None
+    if example.text_b:
+        tokens_b = tokenizer.tokenize(example.text_b)
+
+    if tokens_b:
+        # Modifies `tokens_a` and `tokens_b` in place so that the total
+        # length is less than the specified length.
+        # Account for [CLS], [SEP], [SEP] with "- 3"
+        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+    else:
+        # Account for [CLS] and [SEP] with "- 2"
+        if len(tokens_a) > max_seq_length - 2:
+            tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+    tokens = []
+    segment_ids = []
+    tokens.append("[CLS]")
+    segment_ids.append(0)
+    for token in tokens_a:
+        tokens.append(token)
+        segment_ids.append(0)
+    tokens.append("[SEP]")
+    segment_ids.append(0)
+
+    if tokens_b:
+        for token in tokens_b:
+            tokens.append(token)
+            segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    input_mask = [1] * len(input_ids)
+    label_id = label_map[example.label]
+
+    feature = BertInputFeatures(
+        input_ids=input_ids,
+        input_mask=input_mask,
+        segment_ids=segment_ids,
+        label_id=label_id)
+
+    return feature
+
+
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer):
+    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            print("Writing example %d of %d" % (ex_index, len(examples)))
+
+        feature = convert_single_example(ex_index, example, label_list,
+                                         max_seq_length, tokenizer)
+
+        features.append(feature)
+
+    return features
+
+
+def _read_tsv(input_file, delimiter="\t", quotechar=None):
+    """Reads a tab separated value file."""
+    with io.open(input_file, "r", encoding="utf8") as f:
+        reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
+        lines = []
+        for line in reader:
+            lines.append(line)
+        return lines
+
+
+class SingleSentenceDataset(Dataset):
+    def __init__(self,
+                 tokenizer,
+                 label_list,
+                 max_seq_length,
+                 mode="all_in_memory"):
+
+        assert isinstance(mode,
+                          str), "mode of SingleSentenceDataset should be str"
+        assert mode in [
+            "all_in_memory", "leveldb", "streaming"
+        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb, streaming], but get" % mode
+
+        self.delimiter = None
+        self.mode = mode
+        self.examples = []
+        self._db = None
+        self._line_processor = None
+
+    def load_all_data_in_memory(self,
+                                input_file,
+                                label_list,
+                                max_seq_length,
+                                tokenizer,
+                                line_processor=None,
+                                delimiter="\t",
+                                quotechar=None):
+        lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
+
+        def default_line_processor(line_id, line):
+            assert len(line) == 2
+            text_a = line[0]
+            label = line[1]
+
+            return BertInputExample(
+                str(line_id), text_a=text_a, text_b=None, label=label)
+
+        if line_processor is None:
+            line_processor = default_line_processor
+
+        for (line_id, line) in enumerate(lines):
+            input_example = line_processor(line_id, line)
+            if not input_example:
+                continue
+            input_feature = convert_single_example(
+                str(line_id), input_example, label_list, max_seq_length,
+                tokenizer)
+            self.examples.append(input_feature)
+
+    def prepare_leveldb(self,
+                        input_file,
+                        leveldb_file,
+                        label_list,
+                        max_seq_length,
+                        tokenizer,
+                        line_processor=None,
+                        delimiter="\t",
+                        quotechar=None):
+        def default_line_processor(line_id, line):
+            assert len(line) == 2
+            text_a = line[0]
+            label = line[1]
+
+            return BertInputExample(
+                str(line_id), text_a=text_a, text_b=None, label=label)
+
+        if line_processor is None:
+            line_processor = default_line_processor
+
+        if not os.path.exists(leveldb_file):
+            print("putting data %s into leveldb %s" %
+                  (input_file, leveldb_file))
+            _example_num = 0
+            _db = leveldb.LevelDB(leveldb_file, create_if_missing=True)
+            with io.open(input_file, "r", encoding="utf8") as f:
+                reader = csv.reader(
+                    f, delimiter=delimiter, quotechar=quotechar)
+                line_id = 0
+                for (_line_id, line) in enumerate(reader):
+                    if line_processor(str(_line_id), line) is None:
+                        continue
+
+                    line_str = delimiter.join(line)
+                    _db.Put(
+                        str(line_id).encode("utf8"), line_str.encode("utf8"))
+                    line_id += 1
+                    _example_num += 1
+            _db.Put("_example_num_".encode("utf8"),
+                    str(_example_num).encode("utf8"))
+        else:
+            _db = leveldb.LevelDB(leveldb_file, create_if_missing=False)
+
+        self.label_list = label_list
+        self.max_seq_length = max_seq_length
+        self.tokenizer = tokenizer
+        self.delimiter = delimiter
+        self._db = _db
+        self._line_processor = line_processor
+
+    def __getitem__(self, idx):
+
+        if self.mode == "all_in_memory":
+            return self.examples[idx].input_ids, self.examples[
+                idx].pos_ids, self.examples[idx].segment_ids, self.examples[
+                    idx].label_id
+
+        if self.mode == "leveldb":
+            assert self._db is not None, "you shold call prepare_leveldb before you run dataloader"
+            line_str = self._db.Get(str(idx).encode("utf8"))
+            line_str = line_str.decode("utf8")
+
+            line = line_str.split(self.delimiter)
+            input_example = self._line_processor(str(idx + 1), line)
+
+            input_example = convert_single_example(
+                str(idx + 1), input_example, self.label_list,
+                self.max_seq_length, self.tokenizer)
+
+            return input_example.input_ids, input_example.pos_ids, input_example.segment_ids, input_example.label_id
+
+    def __len__(self):
+        if self.mode == "all_in_memory":
+            return len(self.examples)
+
+        if self.mode == "leveldb":
+            assert self._db is not None, "you shold call prepare_leveldb before you run dataloader"
+
+            exmaple_num = self._db.Get("_example_num_".encode("utf8"))
+            exmaple_num = exmaple_num.decode("utf8")
+            return int(exmaple_num)
+
+
+class SentencePairDataset(Dataset):
+    def __init__(self,
+                 tokenizer,
+                 label_ist,
+                 max_seq_length,
+                 mode="all_in_memory"):
+
+        assert isinstance(mode,
+                          str), "mode of SentencePairDataset should be str"
+        assert mode in [
+            "all_in_memory", "leveldb"
+        ], "mode of SentencePairDataset should be in [all_in_memory, leveldb], but get" % mode
+
+        self.examples = []
+
+    def load_all_data_in_memory(self,
+                                input_file,
+                                label_list,
+                                max_seq_length,
+                                tokenizer,
+                                line_processor=None,
+                                delimiter="\t",
+                                quotechar=None):
+        lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
+
+        def default_line_processor(line_id, line):
+            assert len(line) == 3
+            text_a = line[0]
+            text_b = line[1]
+            label = line[2]
+
+            return BertInputExample(
+                str(line_id), text_a=text_a, text_b=text_b, label=label)
+
+        if line_processor is None:
+            line_processor = default_line_processor
+
+        for (line_id, line) in enumerate(lines):
+            input_example = line_processor(line_id, line)
+            if not input_example:
+                continue
+            input_feature = convert_single_example(
+                str(line_id), input_example, label_list, max_seq_length,
+                tokenizer)
+            self.examples.append(input_feature)
+
+    def __getitem__(self, idx):
+        return self.examples[idx].input_ids, self.examples[
+            idx].pos_ids, self.examples[idx].segment_ids, self.examples[
+                idx].label_id
+
+    def __len__(self):
+        return len(self.examples)
+
+
+def _prepare_train_batch(insts,
+                         vocab_size=0,
+                         pad_id=None,
+                         cls_id=None,
+                         sep_id=None,
+                         mask_id=-1,
+                         return_input_mask=True,
+                         return_max_len=True,
+                         return_num_token=False):
+
+    return prepare_batch_data(
+        insts,
+        0,
+        voc_size=vocab_size,
+        pad_id=pad_id,
+        cls_id=cls_id,
+        sep_id=sep_id,
+        mask_id=mask_id,
+        return_input_mask=return_input_mask,
+        return_max_len=return_max_len,
+        return_num_token=return_num_token)
+
+
+class BertDataLoader(object):
+    def __init__(self,
+                 input_file,
+                 tokenizer,
+                 label_list,
+                 max_seq_length,
+                 batch_size,
+                 shuffle=False,
+                 drop_last=False,
+                 mode="all_in_memory",
+                 leveldb_file="./leveldb",
+                 line_processor=None,
+                 delimiter="\t",
+                 quotechar=None,
+                 device=fluid.CPUPlace(),
+                 num_workers=0,
+                 return_list=True):
+
+        self.dataset = SingleSentenceDataset(tokenizer, label_list,
+                                             max_seq_length, mode)
+
+        if mode == "all_in_memory":
+            self.dataset.load_all_data_in_memory(
+                input_file, label_list, max_seq_length, tokenizer,
+                line_processor, delimiter, quotechar)
+        elif mode == "leveldb":
+            #prepare_leveldb(self, input_file, leveldb_file, label_list, max_seq_length, tokenizer, line_processor=None, delimiter="\t", quotechar=None):
+            self.dataset.prepare_leveldb(input_file, leveldb_file, label_list,
+                                         max_seq_length, tokenizer,
+                                         line_processor, delimiter, quotechar)
+        else:
+            raise ValueError("mode should be in [all_in_memory, leveldb]")
+
+        self.sampler = DistributedBatchSampler(
+            self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
+
+        self.dataloader = DataLoader(
+            dataset=self.dataset,
+            batch_sampler=self.sampler,
+            places=device,
+            collate_fn=partial(
+                _prepare_train_batch,
+                vocab_size=-1,
+                pad_id=tokenizer.vocab["[PAD]"],
+                cls_id=tokenizer.vocab["[CLS]"],
+                sep_id=tokenizer.vocab["[SEP]"],
+                mask_id=-1,
+                return_input_mask=True,
+                return_max_len=False,
+                return_num_token=False),
+            num_workers=num_workers,
+            return_list=return_list)
+
+
+if __name__ == "__main__":
+    print("hello world.")
--- a/hapi/text/bert/optimization.py
+++ b/hapi/text/bert/optimization.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimization and learning rate scheduling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+
+
+class ConstantLR(LearningRateDecay):
+    def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
+        super(ConstantLR, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
+
+    def step(self):
+        return self.learning_rate
+
+
+class LinearDecay(LearningRateDecay):
+    def __init__(self,
+                 learning_rate,
+                 warmup_steps,
+                 decay_steps,
+                 end_learning_rate=0.0001,
+                 power=1.0,
+                 cycle=False,
+                 begin=0,
+                 step=1,
+                 dtype='float32'):
+        super(LinearDecay, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
+        self.warmup_steps = warmup_steps
+        self.decay_steps = decay_steps
+        self.end_learning_rate = end_learning_rate
+        self.power = power
+        self.cycle = cycle
+
+    def step(self):
+        if self.step_num < self.warmup_steps:
+            decayed_lr = self.learning_rate * (self.step_num /
+                                               self.warmup_steps)
+            decayed_lr = self.create_lr_var(decayed_lr)
+        else:
+            tmp_step_num = self.step_num
+            tmp_decay_steps = self.decay_steps
+            if self.cycle:
+                div_res = fluid.layers.ceil(
+                    self.create_lr_var(tmp_step_num / float(self.decay_steps)))
+                if tmp_step_num == 0:
+                    div_res = self.create_lr_var(1.0)
+                tmp_decay_steps = self.decay_steps * div_res
+            else:
+                tmp_step_num = self.create_lr_var(
+                    tmp_step_num
+                    if tmp_step_num < self.decay_steps else self.decay_steps)
+                decayed_lr = (self.learning_rate - self.end_learning_rate) * \
+                    ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
+
+        return decayed_lr
+
+
+class Optimizer(object):
+    def __init__(self,
+                 warmup_steps,
+                 num_train_steps,
+                 learning_rate,
+                 model_cls,
+                 weight_decay,
+                 scheduler='linear_warmup_decay',
+                 loss_scaling=1.0,
+                 parameter_list=None):
+        self.warmup_steps = warmup_steps
+        self.num_train_steps = num_train_steps
+        self.learning_rate = learning_rate
+        self.model_cls = model_cls
+        self.weight_decay = weight_decay
+        self.scheduler = scheduler
+        self.loss_scaling = loss_scaling
+        self.parameter_list = parameter_list
+
+        self.scheduled_lr = 0.0
+        self.optimizer = self.lr_schedule()
+
+    def lr_schedule(self):
+        if self.warmup_steps > 0:
+            if self.scheduler == 'noam_decay':
+                self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
+                    self.warmup_steps * (self.learning_rate**2)),
+                                                            self.warmup_steps)
+            elif self.scheduler == 'linear_warmup_decay':
+                self.scheduled_lr = LinearDecay(self.learning_rate,
+                                                self.warmup_steps,
+                                                self.num_train_steps, 0.0)
+            else:
+                raise ValueError("Unkown learning rate scheduler, should be "
+                                 "'noam_decay' or 'linear_warmup_decay'")
+            optimizer = fluid.optimizer.Adam(
+                learning_rate=self.scheduled_lr,
+                parameter_list=self.parameter_list)
+        else:
+            self.scheduled_lr = ConstantLR(self.learning_rate)
+            optimizer = fluid.optimizer.Adam(
+                learning_rate=self.scheduled_lr,
+                parameter_list=self.parameter_list)
+
+        return optimizer
+
+    def exclude_from_weight_decay(self, name):
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+
+    def minimize(self, loss, use_data_parallel=False, model=None):
+        param_list = dict()
+
+        clip_norm_thres = 1.0
+        #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
+
+        if use_data_parallel:
+            loss = model.scale_loss(loss)
+
+        loss.backward()
+
+        if self.weight_decay > 0:
+            for param in self.model_cls.parameters():
+                param_list[param.name] = param * 1.0
+                param_list[param.name].stop_gradient = True
+
+        if use_data_parallel:
+            assert model is not None
+            model.apply_collective_grads()
+
+        #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
+        _, param_grads = self.optimizer.minimize(loss)
+
+        if self.weight_decay > 0:
+            for param, grad in param_grads:
+                if self.exclude_from_weight_decay(param.name):
+                    continue
+                if isinstance(self.scheduled_lr.step(), float):
+                    updated_param = param.numpy() - param_list[
+                        param.name].numpy(
+                        ) * self.weight_decay * self.scheduled_lr.step()
+                else:
+                    updated_param = param.numpy(
+                    ) - param_list[param.name].numpy(
+                    ) * self.weight_decay * self.scheduled_lr.step().numpy()
+                updated_param_var = fluid.dygraph.to_variable(updated_param)
+                param = updated_param_var
+                #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
--- a/hapi/text/bert/static_optimization.py
+++ b/hapi/text/bert/static_optimization.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimization and learning rate scheduling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
+
+
+def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
+    """ Applies linear warmup of learning rate from 0 and decay to 0."""
+    with fluid.default_main_program()._lr_schedule_guard():
+        lr = fluid.layers.tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="scheduled_learning_rate")
+
+        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter(
+        )
+
+        with fluid.layers.control_flow.Switch() as switch:
+            with switch.case(global_step < warmup_steps):
+                warmup_lr = learning_rate * (global_step / warmup_steps)
+                fluid.layers.tensor.assign(warmup_lr, lr)
+            with switch.default():
+                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
+                    learning_rate=learning_rate,
+                    decay_steps=num_train_steps,
+                    end_learning_rate=0.0,
+                    power=1.0,
+                    cycle=False)
+                fluid.layers.tensor.assign(decayed_lr, lr)
+
+        return lr
+
+
+def optimization(loss,
+                 warmup_steps,
+                 num_train_steps,
+                 learning_rate,
+                 train_program,
+                 startup_prog,
+                 weight_decay,
+                 scheduler='linear_warmup_decay',
+                 use_fp16=False,
+                 use_dynamic_loss_scaling=False,
+                 init_loss_scaling=1.0,
+                 incr_every_n_steps=1000,
+                 decr_every_n_nan_or_inf=2,
+                 incr_ratio=2.0,
+                 decr_ratio=0.8):
+
+    scheduled_lr, loss_scaling = None, None
+    if scheduler == 'noam_decay':
+        if warmup_steps > 0:
+            scheduled_lr = fluid.layers.learning_rate_scheduler\
+             .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
+           warmup_steps)
+        else:
+            print(
+                "WARNING: noam decay of learning rate should have postive warmup "
+                "steps but given {}, using constant learning rate instead!"
+                .format(warmup_steps))
+            scheduled_lr = fluid.layers.create_global_var(
+                name=fluid.unique_name.generate("learning_rate"),
+                shape=[1],
+                value=learning_rate,
+                dtype='float32',
+                persistable=True)
+    elif scheduler == 'linear_warmup_decay':
+        if warmup_steps > 0:
+            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
+                                               num_train_steps)
+        else:
+            print(
+                "WARNING: linear warmup decay of learning rate should have "
+                "postive warmup steps but given {}, use constant learning rate "
+                "instead!".format(warmup_steps))
+            scheduled_lr = fluid.layers.create_global_var(
+                name=fluid.unique_name.generate("learning_rate"),
+                shape=[1],
+                value=learning_rate,
+                dtype='float32',
+                persistable=True)
+    else:
+        raise ValueError("Unkown learning rate scheduler, should be "
+                         "'noam_decay' or 'linear_warmup_decay'")
+
+    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+    fluid.clip.set_gradient_clip(
+        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+
+    def exclude_from_weight_decay(param):
+        name = param.name.rstrip(".master")
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+
+    param_list = dict()
+
+    if use_fp16:
+        loss_scaling = fluid.layers.create_global_var(
+            name=fluid.unique_name.generate("loss_scaling"),
+            shape=[1],
+            value=init_loss_scaling,
+            dtype='float32',
+            persistable=True)
+        loss *= loss_scaling
+
+        param_grads = optimizer.backward(loss)
+        master_param_grads = create_master_params_grads(
+            param_grads, train_program, startup_prog, loss_scaling)
+
+        if weight_decay > 0:
+            for param, _ in master_param_grads:
+                param_list[param.name] = param * 1.0
+                param_list[param.name].stop_gradient = True
+
+        if use_dynamic_loss_scaling:
+            apply_dynamic_loss_scaling(
+                loss_scaling, master_param_grads, incr_every_n_steps,
+                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
+
+        optimizer.apply_gradients(master_param_grads)
+
+        if weight_decay > 0:
+            for param, grad in master_param_grads:
+                if exclude_from_weight_decay(param):
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), fluid.framework.name_scope("weight_decay"):
+                    updated_param = param - param_list[
+                        param.name] * weight_decay * scheduled_lr
+                    fluid.layers.assign(output=param, input=updated_param)
+
+        master_param_to_train_param(master_param_grads, param_grads,
+                                    train_program)
+
+    else:
+        if weight_decay > 0:
+            for param in train_program.all_parameters():
+                param_list[param.name] = param * 1.0
+                param_list[param.name].stop_gradient = True
+
+        _, param_grads = optimizer.minimize(loss)
+
+        if weight_decay > 0:
+            for param, grad in param_grads:
+                if exclude_from_weight_decay(param):
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), fluid.framework.name_scope("weight_decay"):
+                    updated_param = param - param_list[
+                        param.name] * weight_decay * scheduled_lr
+                    fluid.layers.assign(output=param, input=updated_param)
+
+    return scheduled_lr, loss_scaling
--- a/hapi/text/bert/utils/__init__.py
+++ b/hapi/text/bert/utils/__init__.py
--- a/hapi/text/bert/utils/args.py
+++ b/hapi/text/bert/utils/args.py
--- a/hapi/text/bert/utils/cards.py
+++ b/hapi/text/bert/utils/cards.py
--- a/hapi/text/bert/utils/convert_static_to_dygraph.py
+++ b/hapi/text/bert/utils/convert_static_to_dygraph.py
--- a/hapi/text/bert/utils/fp16.py
+++ b/hapi/text/bert/utils/fp16.py
--- a/hapi/text/bert/utils/init.py
+++ b/hapi/text/bert/utils/init.py
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
--- a/hapi/text/tokenizer/__init__.py
+++ b/hapi/text/tokenizer/__init__.py
--- a/hapi/text/tokenizer/tokenization.py
+++ b/hapi/text/tokenizer/tokenization.py
--- a/setup.cfg
+++ b/setup.cfg
--- a/setup.py
+++ b/setup.py
--- a/tests/test_bert_dataloader.py
+++ b/tests/test_bert_dataloader.py