!3 Add example for bert

Merge pull request !3 from c_34/master

!3 Add example for bert
Merge pull request !3 from c_34/master
62b3340c · leiyuning · Gitee · d69092d8 · c530e352 · 62b3340c
6 changed files
--- a/chapter07/Bert_NEZHA/__init__.py
+++ b/chapter07/Bert_NEZHA/__init__.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Bert Init."""
+from .bert_for_pre_training import BertNetworkWithLoss, BertPreTraining, \
+    BertPretrainingLoss, GetMaskedLMOutput, GetNextSentenceOutput, \
+    BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
+from .bert_model import BertAttention, BertConfig, BertEncoderCell, BertModel, \
+    BertOutput, BertSelfAttention, BertTransformer, EmbeddingLookup, \
+    EmbeddingPostprocessor, RelaPosEmbeddingsGenerator, RelaPosMatrixGenerator, \
+    SaturateCast, CreateAttentionMaskFromInputMask
+
+# Public API of the package: the seven names imported from bert_for_pre_training
+# followed by the thirteen names imported from bert_model.
+__all__ = [
+    "BertNetworkWithLoss", "BertPreTraining", "BertPretrainingLoss",
+    "GetMaskedLMOutput", "GetNextSentenceOutput", "BertTrainOneStepCell", "BertTrainOneStepWithLossScaleCell",
+    "BertAttention", "BertConfig", "BertEncoderCell", "BertModel", "BertOutput",
+    "BertSelfAttention", "BertTransformer", "EmbeddingLookup",
+    "EmbeddingPostprocessor", "RelaPosEmbeddingsGenerator",
+    "RelaPosMatrixGenerator", "SaturateCast", "CreateAttentionMaskFromInputMask"
+]
--- a/chapter07/Bert_NEZHA/bert_for_pre_training.py
+++ b/chapter07/Bert_NEZHA/bert_for_pre_training.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Bert for pretraining."""
+import numpy as np
+
+import mindspore.nn as nn
+from mindspore.common.initializer import initializer, TruncatedNormal
+from mindspore.ops import operations as P
+from mindspore.ops import functional as F
+from mindspore.ops import composite as C
+from mindspore.common.tensor import Tensor
+from mindspore.common.parameter import Parameter, ParameterTuple
+from mindspore.common import dtype as mstype
+from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
+from mindspore.train.parallel_utils import ParallelMode
+from mindspore.communication.management import get_group_size
+from mindspore import context
+from .bert_model import BertModel
+
+# Clipping mode the training cells pass to ClipGradients: 0 clips by value, 1 clips by norm.
+GRADIENT_CLIP_TYPE = 1
+# Clipping threshold the training cells pass to ClipGradients together with the mode above.
+GRADIENT_CLIP_VALUE = 1.0
+
+
+class ClipGradients(nn.Cell):
+    """
+    Clip every gradient in a tuple, either by value or by norm.
+
+    Inputs:
+        grads (tuple[Tensor]): Gradients.
+        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
+            Any other value returns `grads` unchanged.
+        clip_value (float): Specifies how much to clip.
+
+    Outputs:
+        tuple[Tensor], clipped gradients.
+    """
+    def __init__(self):
+        super(ClipGradients, self).__init__()
+        self.clip_by_norm = nn.ClipByNorm()
+        self.cast = P.Cast()
+        self.dtype = P.DType()
+
+    def construct(self,
+                  grads,
+                  clip_type,
+                  clip_value):
+        # Unknown clip types are treated as "no clipping".
+        if clip_type != 0 and clip_type != 1:
+            return grads
+
+        # Tuples are rebuilt by concatenation; the result keeps the order of `grads`.
+        new_grads = ()
+        for grad in grads:
+            # The threshold tensors are cast to each gradient's own dtype.
+            dt = self.dtype(grad)
+            if clip_type == 0:
+                # Clip by value with bounds -clip_value and +clip_value.
+                t = C.clip_by_value(grad, self.cast(F.tuple_to_array((-clip_value,)), dt),
+                                    self.cast(F.tuple_to_array((clip_value,)), dt))
+            else:
+                # clip_type == 1: clip by norm with clip_value as the threshold.
+                t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
+            new_grads = new_grads + (t,)
+
+        return new_grads
+
+
+class GetMaskedLMOutput(nn.Cell):
+    """
+    Get masked lm output.
+
+    Gathers the encoder outputs at the masked positions, passes them through a
+    dense layer and layer normalization, multiplies by the transposed output
+    weights, adds a bias and applies log-softmax.
+
+    Args:
+        config (BertConfig): The config of BertModel. The attributes read here are
+            hidden_size, initializer_range, hidden_act, compute_type, dtype,
+            vocab_size, batch_size and seq_length.
+
+    Inputs:
+        input_tensor (Tensor): Sequence output of the encoder; it is reshaped to
+            (batch_size * seq_length, hidden_size).
+        output_weights (Tensor): Weights multiplied (transposed) with the transformed
+            hidden states; BertPreTraining passes the embedding table.
+        positions (Tensor): Positions of the masked tokens inside each sequence.
+
+    Returns:
+        Tensor, masked lm output.
+    """
+    def __init__(self, config):
+        super(GetMaskedLMOutput, self).__init__()
+        self.width = config.hidden_size
+        self.reshape = P.Reshape()
+        self.gather = P.GatherV2()
+
+        weight_init = TruncatedNormal(config.initializer_range)
+        self.dense = nn.Dense(self.width,
+                              config.hidden_size,
+                              weight_init=weight_init,
+                              activation=config.hidden_act).to_float(config.compute_type)
+        self.layernorm = nn.LayerNorm(config.hidden_size).to_float(config.compute_type)
+        # One bias entry per vocabulary item, initialized to zero.
+        self.output_bias = Parameter(
+            initializer(
+                'zero',
+                config.vocab_size),
+            name='output_bias')
+        self.matmul = P.MatMul(transpose_b=True)
+        self.log_softmax = nn.LogSoftmax(axis=-1)
+        self.shape_flat_offsets = (-1, 1)
+        # Sample indices 0 .. batch_size - 1, used to compute per-sample offsets.
+        self.rng = Tensor(np.array(range(0, config.batch_size)).astype(np.int32))
+        self.last_idx = (-1,)
+        # Shapes are fixed at construction time from batch_size and seq_length.
+        self.shape_flat_sequence_tensor = (config.batch_size * config.seq_length, self.width)
+        self.seq_length_tensor = Tensor(np.array((config.seq_length,)).astype(np.int32))
+        self.cast = P.Cast()
+        self.compute_type = config.compute_type
+        self.dtype = config.dtype
+
+    def construct(self,
+                  input_tensor,
+                  output_weights,
+                  positions):
+        # Offset of each sample in the flattened sequence: sample_index * seq_length, as a column.
+        flat_offsets = self.reshape(
+            self.rng * self.seq_length_tensor, self.shape_flat_offsets)
+        # Turn per-sequence positions into indices into the flattened tensor.
+        # assumes positions is (batch_size, predictions_per_sequence) — TODO confirm
+        flat_position = self.reshape(positions + flat_offsets, self.last_idx)
+        flat_sequence_tensor = self.reshape(input_tensor, self.shape_flat_sequence_tensor)
+        # Pick the rows (axis 0) that correspond to the masked positions.
+        input_tensor = self.gather(flat_sequence_tensor, flat_position, 0)
+        input_tensor = self.cast(input_tensor, self.compute_type)
+        output_weights = self.cast(output_weights, self.compute_type)
+        input_tensor = self.dense(input_tensor)
+        input_tensor = self.layernorm(input_tensor)
+        # matmul was created with transpose_b=True, so output_weights is used transposed.
+        logits = self.matmul(input_tensor, output_weights)
+        # The bias add and log-softmax run in config.dtype rather than compute_type.
+        logits = self.cast(logits, self.dtype)
+        logits = logits + self.output_bias
+        log_probs = self.log_softmax(logits)
+        return log_probs
+
+
+class GetNextSentenceOutput(nn.Cell):
+    """
+    Get next sentence output.
+
+    Applies a dense layer with two output units to the input and returns the
+    log-softmax of the result.
+
+    Args:
+        config (BertConfig): The config of Bert. The attributes read here are
+            initializer_range, hidden_size, compute_type and dtype.
+
+    Inputs:
+        input_tensor (Tensor): Input of the dense layer; BertPreTraining passes
+            the pooled output of BertModel.
+
+    Returns:
+        Tensor, next sentence output.
+    """
+    def __init__(self, config):
+        super(GetNextSentenceOutput, self).__init__()
+        self.log_softmax = P.LogSoftmax()
+        self.weight_init = TruncatedNormal(config.initializer_range)
+        # Two output units, matching the depth-2 one-hot labels used in BertPretrainingLoss.
+        self.dense = nn.Dense(config.hidden_size, 2,
+                              weight_init=self.weight_init, has_bias=True).to_float(config.compute_type)
+        self.dtype = config.dtype
+        self.cast = P.Cast()
+
+    def construct(self, input_tensor):
+        logits = self.dense(input_tensor)
+        # Log-softmax runs in config.dtype rather than compute_type.
+        logits = self.cast(logits, self.dtype)
+        log_prob = self.log_softmax(logits)
+        return log_prob
+
+
+class BertPreTraining(nn.Cell):
+    """
+    Bert pretraining network.
+
+    Runs BertModel and feeds its outputs to the masked-LM head and the
+    next-sentence head.
+
+    Args:
+        config (BertConfig): The config of BertModel.
+        is_training (bool): Specifies whether to use the training mode.
+        use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings.
+
+    Inputs:
+        input_ids, input_mask, token_type_id (Tensor): Inputs forwarded to BertModel.
+        masked_lm_positions (Tensor): Positions forwarded to the masked-LM head.
+
+    Returns:
+        tuple(Tensor, Tensor), prediction_scores and seq_relationship_score.
+    """
+    def __init__(self, config, is_training, use_one_hot_embeddings):
+        super(BertPreTraining, self).__init__()
+        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
+        self.cls1 = GetMaskedLMOutput(config)
+        self.cls2 = GetNextSentenceOutput(config)
+
+    def construct(self, input_ids, input_mask, token_type_id,
+                  masked_lm_positions):
+        # NOTE(review): BertModel receives (input_ids, token_type_id, input_mask), i.e. the
+        # last two are swapped relative to this method's parameter order. This presumably
+        # matches BertModel.construct, which is not visible here — confirm.
+        sequence_output, pooled_output, embedding_table = \
+            self.bert(input_ids, token_type_id, input_mask)
+        # The embedding table serves as the output weights of the masked-LM head.
+        prediction_scores = self.cls1(sequence_output,
+                                      embedding_table,
+                                      masked_lm_positions)
+        seq_relationship_score = self.cls2(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+class BertPretrainingLoss(nn.Cell):
+    """
+    Provide bert pre-training loss.
+
+    The total loss is the weighted masked-LM loss plus the mean next-sentence loss.
+
+    Args:
+        config (BertConfig): The config of BertModel. Only vocab_size is read.
+
+    Inputs:
+        prediction_scores (Tensor): Output of the masked-LM head.
+        seq_relationship_score (Tensor): Output of the next-sentence head.
+        masked_lm_ids (Tensor): Label ids of the masked tokens.
+        masked_lm_weights (Tensor): Weight of each masked token in the loss.
+        next_sentence_labels (Tensor): Labels of the next-sentence task.
+
+    Returns:
+        Tensor, total loss.
+    """
+    def __init__(self, config):
+        super(BertPretrainingLoss, self).__init__()
+        self.vocab_size = config.vocab_size
+        self.onehot = P.OneHot()
+        self.on_value = Tensor(1.0, mstype.float32)
+        self.off_value = Tensor(0.0, mstype.float32)
+        self.reduce_sum = P.ReduceSum()
+        self.reduce_mean = P.ReduceMean()
+        self.reshape = P.Reshape()
+        self.last_idx = (-1,)
+        self.neg = P.Neg()
+        self.cast = P.Cast()
+
+    def construct(self, prediction_scores, seq_relationship_score, masked_lm_ids,
+                  masked_lm_weights, next_sentence_labels):
+        """Defines the computation performed."""
+        # masked_lm_loss: flatten labels and weights, then one-hot encode the labels.
+        label_ids = self.reshape(masked_lm_ids, self.last_idx)
+        label_weights = self.cast(self.reshape(masked_lm_weights, self.last_idx), mstype.float32)
+        one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)
+
+        # Negative score of the labelled class for every masked position.
+        per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
+        numerator = self.reduce_sum(label_weights * per_example_loss, ())
+        # 1e-5 is added so the division below never has a zero denominator.
+        denominator = self.reduce_sum(label_weights, ()) + self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
+        masked_lm_loss = numerator / denominator
+
+        # next_sentence_loss
+        labels = self.reshape(next_sentence_labels, self.last_idx)
+        one_hot_labels = self.onehot(labels, 2, self.on_value, self.off_value)
+        per_example_loss = self.neg(self.reduce_sum(
+            one_hot_labels * seq_relationship_score, self.last_idx))
+        next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx)
+
+        # total_loss
+        total_loss = masked_lm_loss + next_sentence_loss
+
+        return total_loss
+
+
+class BertNetworkWithLoss(nn.Cell):
+    """
+    Provide bert pre-training loss through network.
+
+    Chains BertPreTraining and BertPretrainingLoss so that one call maps a
+    batch of inputs and labels to a scalar loss.
+
+    Args:
+        config (BertConfig): The config of BertModel.
+        is_training (bool): Specifies whether to use the training mode.
+        use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.
+
+    Inputs:
+        input_ids, input_mask, token_type_id, masked_lm_positions (Tensor):
+            Forwarded to BertPreTraining.
+        next_sentence_labels, masked_lm_ids, masked_lm_weights (Tensor):
+            Forwarded to BertPretrainingLoss.
+
+    Returns:
+        Tensor, the loss of the network.
+    """
+    def __init__(self, config, is_training, use_one_hot_embeddings=False):
+        super(BertNetworkWithLoss, self).__init__()
+        self.bert = BertPreTraining(config, is_training, use_one_hot_embeddings)
+        self.loss = BertPretrainingLoss(config)
+        self.cast = P.Cast()
+
+    def construct(self,
+                  input_ids,
+                  input_mask,
+                  token_type_id,
+                  next_sentence_labels,
+                  masked_lm_positions,
+                  masked_lm_ids,
+                  masked_lm_weights):
+        prediction_scores, seq_relationship_score = \
+            self.bert(input_ids, input_mask, token_type_id, masked_lm_positions)
+        # Note the argument order: the loss cell takes next_sentence_labels last.
+        total_loss = self.loss(prediction_scores, seq_relationship_score,
+                               masked_lm_ids, masked_lm_weights, next_sentence_labels)
+        # The returned loss is always float32.
+        return self.cast(total_loss, mstype.float32)
+
+
+class BertTrainOneStepCell(nn.Cell):
+    """
+    Encapsulation class of bert network training.
+
+    Appends an optimizer to the training network; after that, the construct
+    function can be called to create the backward graph. Each call computes
+    the loss, the gradients, clips them, optionally reduces them across
+    devices and applies the optimizer.
+
+    Args:
+        network (Cell): The training network. Note that loss function should have been added.
+        optimizer (Optimizer): Optimizer for updating the weights.
+        sens (Number): The adjust parameter. Default: 1.0.
+
+    Outputs:
+        Tensor, the loss returned by `network`.
+    """
+    def __init__(self, network, optimizer, sens=1.0):
+        super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
+        self.network = network
+        self.weights = ParameterTuple(network.trainable_params())
+        self.optimizer = optimizer
+        # Gradients are taken w.r.t. the weight list, with an explicit sensitivity input.
+        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
+        self.sens = sens
+        # The gradient reducer is only created for data-parallel and hybrid-parallel modes.
+        self.reducer_flag = False
+        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
+            self.reducer_flag = True
+        self.grad_reducer = None
+        if self.reducer_flag:
+            mean = context.get_auto_parallel_context("mirror_mean")
+            degree = get_group_size()
+            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
+
+        self.clip_gradients = ClipGradients()
+        self.cast = P.Cast()
+
+    def set_sens(self, value):
+        """Replace the sensitivity value that is fed to the gradient operation."""
+        self.sens = value
+
+    def construct(self,
+                  input_ids,
+                  input_mask,
+                  token_type_id,
+                  next_sentence_labels,
+                  masked_lm_positions,
+                  masked_lm_ids,
+                  masked_lm_weights):
+        """Defines the computation performed."""
+        weights = self.weights
+
+        # Forward pass; its result is the value returned to the caller.
+        loss = self.network(input_ids,
+                            input_mask,
+                            token_type_id,
+                            next_sentence_labels,
+                            masked_lm_positions,
+                            masked_lm_ids,
+                            masked_lm_weights)
+        # Because sens_param=True, the sensitivity is appended as the last argument (as float32).
+        grads = self.grad(self.network, weights)(input_ids,
+                                                 input_mask,
+                                                 token_type_id,
+                                                 next_sentence_labels,
+                                                 masked_lm_positions,
+                                                 masked_lm_ids,
+                                                 masked_lm_weights,
+                                                 self.cast(F.tuple_to_array((self.sens,)),
+                                                           mstype.float32))
+        # Clipping uses the module-level mode and threshold and happens before the reduction.
+        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
+        if self.reducer_flag:
+            # apply grad reducer on grads
+            grads = self.grad_reducer(grads)
+
+        succ = self.optimizer(grads)
+        # presumably ties the returned loss to the optimizer step so the update is not pruned — confirm
+        return F.depend(loss, succ)
+
+
+# Multitype function graph used by BertTrainOneStepWithLossScaleCell to undo the loss scale
+# on every gradient (it is mapped over the gradient tuple with HyperMap).
+grad_scale = C.MultitypeFuncGraph("grad_scale")
+reciprocal = P.Reciprocal()
+
+
+@grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+    """Return `grad` multiplied by the reciprocal of `scale` (overload for Tensor, Tensor)."""
+    return grad * reciprocal(scale)
+
+
+class BertTrainOneStepWithLossScaleCell(nn.Cell):
+    """
+    Encapsulation class of bert network training with loss scaling.
+
+    Appends an optimizer to the training network; after that, the construct
+    function can be called to create the backward graph. Gradients are computed
+    with a scaled sensitivity, unscaled, clipped and optionally reduced; the
+    optimizer step is skipped when an overflow is reported.
+
+    Args:
+        network (Cell): The training network. Note that loss function should have been added.
+        optimizer (Optimizer): Optimizer for updating the weights.
+        scale_update_cell (Cell): Cell to do the loss scale. Default: None.
+
+    Inputs:
+        The seven network inputs, plus `sens` (Tensor, optional): loss scale to use
+        for this step. When it is None the internal `loss_scale` parameter is used
+        and updated through `scale_update_cell`.
+
+    Outputs:
+        tuple, (loss, cond) where `cond` is the overflow condition computed in this step.
+    """
+    def __init__(self, network, optimizer, scale_update_cell=None):
+        super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
+        self.network = network
+        self.weights = ParameterTuple(network.trainable_params())
+        self.optimizer = optimizer
+        # Gradients are taken w.r.t. the weight list, with an explicit sensitivity input.
+        self.grad = C.GradOperation('grad',
+                                    get_by_list=True,
+                                    sens_param=True)
+        # The gradient reducer is only created for data-parallel and hybrid-parallel modes.
+        self.reducer_flag = False
+        self.allreduce = P.AllReduce()
+        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
+            self.reducer_flag = True
+        self.grad_reducer = None
+        if self.reducer_flag:
+            mean = context.get_auto_parallel_context("mirror_mean")
+            degree = get_group_size()
+            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
+        # Any mode other than stand-alone sums the overflow flag across devices.
+        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
+        self.clip_gradients = ClipGradients()
+        self.cast = P.Cast()
+        # NPU float-status operators used in construct to detect overflow.
+        self.alloc_status = P.NPUAllocFloatStatus()
+        self.get_status = P.NPUGetFloatStatus()
+        self.clear_before_grad = P.NPUClearFloatStatus()
+        self.reduce_sum = P.ReduceSum(keep_dims=False)
+        # Not referenced in construct below.
+        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
+        # Threshold compared against the summed status flag.
+        self.base = Tensor(1, mstype.float32)
+        self.less_equal = P.LessEqual()
+        self.hyper_map = C.HyperMap()
+        # loss_scale stays None unless a scale_update_cell is supplied.
+        self.loss_scale = None
+        self.loss_scaling_manager = scale_update_cell
+        if scale_update_cell:
+            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
+                                        name="loss_scale")
+        self.add_flags(has_effect=True)
+
+    def construct(self,
+                  input_ids,
+                  input_mask,
+                  token_type_id,
+                  next_sentence_labels,
+                  masked_lm_positions,
+                  masked_lm_ids,
+                  masked_lm_weights,
+                  sens=None):
+        """Defines the computation performed."""
+        weights = self.weights
+        # Forward pass; its result is part of the value returned to the caller.
+        loss = self.network(input_ids,
+                            input_mask,
+                            token_type_id,
+                            next_sentence_labels,
+                            masked_lm_positions,
+                            masked_lm_ids,
+                            masked_lm_weights)
+        # NOTE(review): with scale_update_cell=None and sens=None, scaling_sens is None and the
+        # casts below receive None — callers presumably always provide one of the two; confirm.
+        if sens is None:
+            scaling_sens = self.loss_scale
+        else:
+            scaling_sens = sens
+        # alloc status and clear should be right before gradoperation
+        init = self.alloc_status()
+        self.clear_before_grad(init)
+        # Because sens_param=True, the loss scale is appended as the last argument (as float32).
+        grads = self.grad(self.network, weights)(input_ids,
+                                                 input_mask,
+                                                 token_type_id,
+                                                 next_sentence_labels,
+                                                 masked_lm_positions,
+                                                 masked_lm_ids,
+                                                 masked_lm_weights,
+                                                 self.cast(scaling_sens,
+                                                           mstype.float32))
+        # Undo the scaling: every gradient is multiplied by 1 / scaling_sens.
+        grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
+        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
+        if self.reducer_flag:
+            # apply grad reducer on grads
+            grads = self.grad_reducer(grads)
+        # Read the float status back into `init` and sum it into a single flag.
+        self.get_status(init)
+        flag_sum = self.reduce_sum(init, (0,))
+        if self.is_distributed:
+            # sum overflow flag over devices
+            flag_reduce = self.allreduce(flag_sum)
+            cond = self.less_equal(self.base, flag_reduce)
+        else:
+            cond = self.less_equal(self.base, flag_sum)
+        # cond is true when the summed flag is at least 1.
+        overflow = cond
+        if sens is None:
+            # The scale-update cell receives the current scale and the condition; its
+            # return value decides whether the optimizer step is skipped.
+            overflow = self.loss_scaling_manager(self.loss_scale, cond)
+        if overflow:
+            # Skip the weight update for this step.
+            succ = False
+        else:
+            succ = self.optimizer(grads)
+        ret = (loss, cond)
+        # presumably ties the returned tuple to the optimizer step so the update is not pruned — confirm
+        return F.depend(ret, succ)
--- a/chapter07/Bert_NEZHA/bert_model.py
+++ b/chapter07/Bert_NEZHA/bert_model.py
--- a/chapter07/Bert_NEZHA_cnwiki/config.py
+++ b/chapter07/Bert_NEZHA_cnwiki/config.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+network config setting, will be used in train.py
+"""
+
+from easydict import EasyDict as edict
+import mindspore.common.dtype as mstype
+from mindspore.model_zoo.Bert_NEZHA import BertConfig
+# Training hyper-parameters and dataset paths read by train.py.
+bert_train_cfg = edict({
+    # train.py repeats the dataset this many times
+    'epoch_size': 10,
+    # the next five entries are forwarded to the Lamb optimizer in train.py
+    # (as warmup_steps, start_learning_rate, end_learning_rate, decay_steps and power)
+    'num_warmup_steps': 0,
+    'start_learning_rate': 1e-4,
+    'end_learning_rate': 0.0,
+    'decay_steps': 1000,
+    'power': 10.0,
+    # the next three entries configure checkpoint saving in train.py
+    'save_checkpoint_steps': 2000,
+    'keep_checkpoint_max': 10,
+    'checkpoint_prefix': "checkpoint_bert",
+    # please add your own dataset path
+    'DATA_DIR': "/your/path/examples.tfrecord",
+    # please add your own dataset schema path
+    'SCHEMA_DIR': "/your/path/datasetSchema.json"
+})
+# Network configuration: train.py passes it to BertNetworkWithLoss and uses its
+# batch_size to batch the dataset.
+bert_net_cfg = BertConfig(
+    batch_size=16,
+    # keep in sync with --max_seq_length used when creating the pretraining data
+    # (the train.py docstring shows 128)
+    seq_length=128,
+    vocab_size=21136,
+    hidden_size=1024,
+    num_hidden_layers=24,
+    num_attention_heads=16,
+    intermediate_size=4096,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.0,
+    attention_probs_dropout_prob=0.0,
+    max_position_embeddings=512,
+    type_vocab_size=2,
+    initializer_range=0.02,
+    use_relative_positions=True,
+    input_mask_from_dataset=True,
+    token_type_ids_from_dataset=True,
+    # NOTE(review): float32 dtype with float16 compute_type looks like a mixed-precision
+    # setup — confirm against BertConfig's documentation
+    dtype=mstype.float32,
+    compute_type=mstype.float16,
+)
--- a/chapter07/Bert_NEZHA_cnwiki/train.py
+++ b/chapter07/Bert_NEZHA_cnwiki/train.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+NEZHA (NEural contextualiZed representation for CHinese lAnguage understanding) is a Chinese pretrained language
+model developed by Huawei, currently based on BERT.
+1. Prepare data
+Following the data preparation as in BERT, run command as below to get dataset for training:
+    python ./create_pretraining_data.py \
+      --input_file=./sample_text.txt \
+      --output_file=./examples.tfrecord \
+      --vocab_file=./your/path/vocab.txt \
+      --do_lower_case=True \
+      --max_seq_length=128 \
+      --max_predictions_per_seq=20 \
+      --masked_lm_prob=0.15 \
+      --random_seed=12345 \
+      --dupe_factor=5
+2. Pretrain
+First, prepare the distributed training environment, then adjust configurations in config.py, finally run train.py.
+"""
+
+import os
+
+import numpy as np
+import mindspore.common.dtype as mstype
+import mindspore.dataset.engine.datasets as de
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore import context
+from mindspore.common.tensor import Tensor
+from mindspore.train.model import Model
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
+from mindspore.model_zoo.Bert_NEZHA import BertNetworkWithLoss, BertTrainOneStepCell
+from mindspore.nn.optim import Lamb
+
+from config import bert_train_cfg, bert_net_cfg
+# Absolute directory of this script (symlinks resolved); not referenced by the functions below.
+_current_dir = os.path.dirname(os.path.realpath(__file__))
+
+def create_train_dataset(batch_size):
+    """
+    Build the pre-training dataset pipeline.
+
+    Reads the records at `bert_train_cfg.DATA_DIR` using the schema at
+    `bert_train_cfg.SCHEMA_DIR`, casts the integer columns to int32, batches
+    the data and repeats it `bert_train_cfg.epoch_size` times.
+
+    Args:
+        batch_size (int): Number of samples per batch; incomplete batches are dropped.
+
+    Returns:
+        The batched and repeated dataset object.
+    """
+    ds = de.StorageDataset([bert_train_cfg.DATA_DIR], bert_train_cfg.SCHEMA_DIR,
+                           columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
+                                         "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"])
+    # `mstype` is mindspore.common.dtype, imported at module level.
+    type_cast_op = C.TypeCast(mstype.int32)
+    # Cast every integer column to int32, in the same order as before.
+    # masked_lm_weights is deliberately left out: the loss casts it to float32 itself.
+    int32_columns = ("masked_lm_ids", "masked_lm_positions", "next_sentence_labels",
+                     "segment_ids", "input_mask", "input_ids")
+    for column in int32_columns:
+        ds = ds.map(input_columns=column, operations=type_cast_op)
+    # apply batch operations
+    ds = ds.batch(batch_size, drop_remainder=True)
+    # apply repeat operations
+    ds = ds.repeat(bert_train_cfg.epoch_size)
+    return ds
+
+def weight_variable(shape):
+    """Return a float32 Tensor of the given shape, sampled uniformly from [-0.1, 0.1) with seed 1."""
+    # The global NumPy seed is reset on every call, so equal shapes give equal values.
+    np.random.seed(1)
+    samples = np.random.uniform(low=-0.1, high=0.1, size=shape)
+    return Tensor(samples.astype(np.float32))
+
+def train_bert():
+    """Set up the Ascend graph-mode context, build the network and optimizer, and run training."""
+    # Execution context: graph mode on Ascend with task sink, loop sink and memory reuse enabled.
+    context.set_context(mode=context.GRAPH_MODE)
+    context.set_context(device_target="Ascend")
+    context.set_context(enable_task_sink=True)
+    context.set_context(enable_loop_sink=True)
+    context.set_context(enable_mem_reuse=True)
+
+    train_set = create_train_dataset(bert_net_cfg.batch_size)
+
+    # Network with loss, wrapped together with a Lamb optimizer into a one-step training cell.
+    loss_net = BertNetworkWithLoss(bert_net_cfg, True)
+    lamb = Lamb(
+        loss_net.trainable_params(),
+        decay_steps=bert_train_cfg.decay_steps,
+        start_learning_rate=bert_train_cfg.start_learning_rate,
+        end_learning_rate=bert_train_cfg.end_learning_rate,
+        power=bert_train_cfg.power,
+        warmup_steps=bert_train_cfg.num_warmup_steps,
+        decay_filter=lambda x: False,
+    )
+    train_cell = BertTrainOneStepCell(loss_net, optimizer=lamb)
+    train_cell.set_train(True)
+    model = Model(train_cell)
+
+    # Checkpointing policy comes from the training config.
+    ckpt_config = CheckpointConfig(
+        save_checkpoint_steps=bert_train_cfg.save_checkpoint_steps,
+        keep_checkpoint_max=bert_train_cfg.keep_checkpoint_max,
+    )
+    ckpt_callback = ModelCheckpoint(prefix=bert_train_cfg.checkpoint_prefix, config=ckpt_config)
+
+    # The epoch count is taken from the dataset's repeat count.
+    model.train(
+        train_set.get_repeat_count(),
+        train_set,
+        callbacks=[LossMonitor(), ckpt_callback],
+        dataset_sink_mode=False,
+    )
+
+# Start pre-training only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    train_bert()
--- a/chapter07/README.md
+++ b/chapter07/README.md
+# Bert NEZHA
+`NEZHA` (**NE**ural contextuali**Z**ed representation for C**H**inese l**A**nguage understanding) is a Chinese pretrained language model developed by Huawei, currently based on BERT.
+
+- `Bert_NEZHA`: Source code of the NEZHA model, identical to the one in `mindspore.model_zoo.Bert_NEZHA`.
+- `Bert_NEZHA_cnwiki`: The NEZHA pretraining example using data from cnwiki.
\ No newline at end of file