Unverified commit cdd87dd6, authored by N nbcc, committed by GitHub

Merge pull request #655 from zhanghan1992/develop

merge ernie-gram
...@@ -13,7 +13,8 @@ ERNIE 2.0 builds a strong basis for nearly every NLP task: Text Classification,
# News
- May.20.2021:
    - [`ERNIE-Doc`](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-doc), [`ERNIE-Gram`](./ernie_gram/), [`ERNIE-ViL`](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-vil) models are **available** now!
    - `ERNIE-UNIMO` will be released soon.
- Dec.29.2020:
    - Pretrain and finetune ERNIE with [PaddlePaddle v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0-rc).
...@@ -21,7 +22,7 @@ ERNIE 2.0 builds a strong basis for nearly every NLP task: Text Classification,
    - Introducing `Gradient accumulation`, run `ERNIE-large` with only 8G memory.
- Sept.24.2020:
    - We have announced [`ERNIE-ViL`](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-vil)!
        - A **knowledge-enhanced** joint representation for vision-language tasks.
        - Constructing three **Scene Graph Prediction** tasks utilizing structured knowledge.
        - State-of-the-art performance on 5 downstream tasks, and 1st place on the [VCR leaderboard](https://visualcommonsense.com/leaderboard/).
......
...@@ -12,10 +12,10 @@ ERNIE is Baidu's pioneering continual-learning framework for knowledge-enhanced semantic understanding
- 2021.5.20:
    - New members join the ERNIE open-source family!
        - `ERNIE-Gram`, an explicitly multi-granularity language knowledge model, is [officially open-sourced](./ernie_gram/)
        - `ERNIE-Doc`, a bidirectional pre-training model for very long documents, is [officially open-sourced](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-doc)
        - `ERNIE-ViL`, a cross-modal pre-training model fusing scene-graph knowledge, is [officially open-sourced](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-vil)
        - `ERNIE-UNIMO`, a unified pre-training model for language and vision, will be open-sourced soon
- 2020.12.29:
    - The `ERNIE` open-source toolkit is fully upgraded to [PaddlePaddle v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0-rc)
...@@ -23,7 +23,7 @@ ERNIE is Baidu's pioneering continual-learning framework for knowledge-enhanced semantic understanding
    - Introducing `Gradient accumulation`: `ERNIE-large` now runs with only 8G of GPU memory.
- 2020.9.24:
    - `ERNIE-ViL` announced! ([click here](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-vil))
        - A knowledge-enhanced pre-training framework for vision-language, the first to introduce structured knowledge into vision-language pre-training.
        - Uses knowledge from scene graphs to build object, attribute, and relation prediction tasks that capture fine-grained cross-modal semantic alignment.
        - Achieves the best results on five vision-language downstream tasks and 1st place on the [Visual Commonsense Reasoning leaderboard](https://visualcommonsense.com/).
......
...@@ -19,12 +19,13 @@ from __future__ import unicode_literals
import json
import logging
import math
import six
if six.PY2:
    from pathlib2 import Path
else:
    from pathlib import Path
import numpy as np
import paddle as P
from paddle import nn
from paddle.nn import functional as F
...@@ -36,7 +37,35 @@ ACT_DICT = {
    'relu': nn.ReLU,
    'gelu': nn.GELU,
}
def _get_rel_pos_bias(seq_len, max_len=128, num_buckets=32, bidirectional=True, reset=True):
    """Map relative positions to T5-style bucket ids; returns a
    [seq_len, seq_len] int64 matrix of bucket indices."""
    pos = np.array(range(seq_len))
    rel_pos = pos[:, None] - pos[None, :]
    ret = 0
    n = -rel_pos
    if bidirectional:
        # use one half of the buckets for each direction
        num_buckets //= 2
        ret += (n < 0).astype('int32') * num_buckets  # mtf.to_int32(mtf.less(n, 0)) * num_buckets
        n = np.abs(n)
    else:
        n = np.maximum(n, np.zeros_like(n))  # was np.max, which is a reduction, not elementwise
    # now n is in the range [0, inf)
    # half of the buckets are for exact increments in positions
    max_exact = num_buckets // 2
    is_small = n < max_exact
    # the other half of the buckets are for logarithmically bigger bins up to max_len
    val_if_large = max_exact + (np.log(n.astype('float32') / max_exact) /
                                math.log(max_len / max_exact) *
                                (num_buckets - max_exact)).astype('int32')
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    ret += np.where(is_small, n, val_if_large)
    if reset:
        # reserve dedicated buckets for interactions with position 0 ([CLS])
        num_buckets *= 2
        ret[:, 0] = num_buckets
        ret[0, :] = num_buckets // 2
    return np.array(ret).reshape([seq_len, seq_len]).astype("int64")
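# Illustration (not part of the model): with the defaults, _get_rel_pos_bias(4)
# returns
#     [[16, 16, 16, 16],
#      [32,  0,  1,  2],
#      [32, 17,  0,  1],
#      [32, 18, 17,  0]]
# i.e. |i - j| indexes the "exact" buckets, +16 marks the backward direction,
# and reset=True pins row/column 0 to dedicated buckets for [CLS].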
def _build_linear(n_in, n_out, name, init):
    return nn.Linear(
...@@ -223,6 +252,8 @@ class PretrainedModel(object):
        'ernie-2.0-en': bce + 'model-ernie2.0-en.1.tar.gz',
        'ernie-2.0-large-en': bce + 'model-ernie2.0-large-en.1.tar.gz',
        'ernie-tiny': bce + 'model-ernie_tiny.1.tar.gz',
        'ernie-gram-zh': bce + 'model-ernie-gram-zh.1.tar.gz',
        'ernie-gram-en': bce + 'model-ernie-gram-en.1.tar.gz',
    }
    @classmethod
...@@ -283,10 +314,14 @@ class ErnieModel(nn.Layer, PretrainedModel):
        d_vocab = cfg['vocab_size']
        d_pos = cfg['max_position_embeddings']
        d_sent = cfg.get("sent_type_vocab_size") or cfg['type_vocab_size']
        self.d_rel_pos = cfg.get('rel_pos_size', None)
        max_seq_len = cfg.get("max_seq_len", 512)
        self.n_head = cfg['num_attention_heads']
        self.return_additional_info = cfg.get('return_additional_info', False)
        initializer = nn.initializer.TruncatedNormal(
            std=cfg['initializer_range'])
        if self.d_rel_pos:
            self.rel_pos_bias = _get_rel_pos_bias(max_seq_len)
        self.ln = _build_ln(d_model, name=append_name(name, 'pre_encoder'))
        self.word_emb = nn.Embedding(
...@@ -307,6 +342,13 @@ class ErnieModel(nn.Layer, PretrainedModel):
            weight_attr=P.ParamAttr(
                name=append_name(name, 'sent_embedding'),
                initializer=initializer))
        if self.d_rel_pos:
            self.rel_pos_bias_emb = nn.Embedding(
                self.d_rel_pos,
                self.n_head,
                weight_attr=P.ParamAttr(
                    name=append_name(name, 'rel_pos_embedding'),
                    initializer=initializer))
        prob = cfg['hidden_dropout_prob']
        self.dropout = nn.Dropout(p=prob)
...@@ -347,6 +389,7 @@ class ErnieModel(nn.Layer, PretrainedModel):
                attn_bias=None,
                past_cache=None,
                use_causal_mask=False):
        """
        Args:
            src_ids (`Variable` of shape `[batch_size, seq_len]`):
...@@ -402,15 +445,20 @@ class ErnieModel(nn.Layer, PretrainedModel):
            attn_bias = (1. - attn_bias) * -10000.0
            attn_bias = attn_bias.unsqueeze(1).tile(
                [1, self.n_head, 1, 1])  # avoid broadcast =_=
            attn_bias.stop_gradient = True
        if sent_ids is None:
            sent_ids = P.zeros_like(src_ids)
        if self.d_rel_pos:
            rel_pos_ids = self.rel_pos_bias[:d_seqlen, :d_seqlen]
            rel_pos_ids = P.to_tensor(rel_pos_ids, dtype='int64')
            rel_pos_bias = self.rel_pos_bias_emb(rel_pos_ids).transpose([2, 0, 1])
            attn_bias += rel_pos_bias
        src_embedded = self.word_emb(src_ids)
        pos_embedded = self.pos_emb(pos_ids)
        sent_embedded = self.sent_emb(sent_ids)
        embedded = src_embedded + pos_embedded + sent_embedded
        embedded = self.dropout(self.ln(embedded))
        encoded, hidden_list, cache_list = self.encoder_stack(
......
...@@ -87,6 +87,8 @@ class ErnieTokenizer(object):
        'ernie-tiny': bce + 'model-ernie_tiny.1.tar.gz',
        'ernie-gen-base-en': bce + 'model-ernie-gen-base-en.1.tar.gz',
        'ernie-gen-large-en': bce + 'model-ernie-gen-large-en.1.tar.gz',
        'ernie-gram-zh': bce + 'model-ernie-gram-zh.1.tar.gz',
        'ernie-gram-en': bce + 'model-ernie-gram-en.1.tar.gz',
    }
    @classmethod
......
English|[Simplified Chinese](./README.zh.md)
`Reminder`: the *ERNIE-Gram* model has been officially released [here](#3-download-pretrained-models-optional). Our reproduction code will be released to the [repro branch](https://github.com/PaddlePaddle/ERNIE/tree/repro) soon.
## _ERNIE-Gram_: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding
![ERNIE-Gram](.meta/ernie-gram.jpeg)
- [Framework](#ernie-gram-framework)
- [Quick Tour](#quick-tour)
- [Setup](#setup)
* [Install PaddlePaddle](#1-install-paddlepaddle)
* [Install ERNIE Kit](#2-install-ernie-kit)
* [Download pre-trained models](#3-download-pretrained-models-optional)
* [Download datasets](#4-download-datasets)
- [Fine-tuning](#fine-tuning)
- [Citation](#citation)
### ERNIE-Gram Framework
Since **ERNIE 1.0**, Baidu researchers have introduced **knowledge-enhanced representation learning** into pre-training, masking consecutive words, phrases, named entities, and other semantic knowledge units to achieve better pre-trained representations. Going further, we propose **ERNIE-Gram**, an explicitly n-gram masked language model that enhances the integration of coarse-grained information into pre-training. In **ERNIE-Gram**, **n-grams** are masked and predicted directly using **explicit** n-gram identities rather than contiguous sequences of tokens.
In downstream tasks, **ERNIE-Gram** uses a `BERT-style` fine-tuning approach, thus maintaining the same parameter size and computational complexity.
We pre-train **ERNIE-Gram** on `English` and `Chinese` text corpora and fine-tune it on `19` downstream tasks. Experimental results show that **ERNIE-Gram** outperforms previous pre-training models such as *XLNet* and *RoBERTa* by a large margin, and achieves results comparable to state-of-the-art methods.
The **ERNIE-Gram** paper has been accepted at **NAACL-HLT 2021**; for more details, see [here](https://arxiv.org/abs/2010.12148).
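To make the contrast concrete, here is a minimal toy sketch (hypothetical vocabulary and ids, not the repository's actual masking code): an n-gram is collapsed into a single `[MASK]` slot and predicted as one coarse-grained unit from an n-gram vocabulary, instead of token by token.
```python
# Toy illustration of explicit n-gram masking (all ids are made up).
ngram_vocab = {("ice", "cream"): 17, ("new", "york"): 42}

def explicit_ngram_mask(tokens, start, end):
    """Collapse tokens[start:end] into one [MASK] slot and return the single
    explicit n-gram id as the prediction target."""
    target = ngram_vocab[tuple(tokens[start:end])]       # one coarse-grained target
    masked = tokens[:start] + ["[MASK]"] + tokens[end:]  # one slot, not n slots
    return masked, target

masked, target = explicit_ngram_mask(["i", "like", "ice", "cream"], 2, 4)
print(masked, target)  # ['i', 'like', '[MASK]'] 17
# A contiguous token-level MLM would instead keep n [MASK] slots and predict
# "ice" and "cream" separately.
```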
### Quick Tour
```shell
mkdir -p data
cd data
wget https://ernie-github.cdn.bcebos.com/data-xnli.tar.gz
tar xf data-xnli.tar.gz
cd ..
#demo for NLI task
sh ernie_gram/run_cls.sh ernie_gram/task_configs/xnli_conf
```
### Setup
##### 1. Install PaddlePaddle
This repo requires PaddlePaddle 2.0.0+; please see [here](https://www.paddlepaddle.org.cn/install/quick) for installation instructions.
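For example (the exact package and version should follow the official selector linked above):
```shell
pip install paddlepaddle-gpu==2.0.0   # GPU build; use `pip install paddlepaddle` for CPU-only
```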
##### 2. Install ERNIE Kit
```shell
git clone https://github.com/PaddlePaddle/ERNIE.git --depth 1
cd ERNIE
pip install -r requirements.txt
pip install -e .
```
##### 3. Download pretrained models (optional)
| Model | Description |abbreviation|
| :------------------------------------------------- | :----------------------------------------------------------- |:-----------|
| [ERNIE-Gram Base for Chinese](https://ernie-github.cdn.bcebos.com/model-ernie-gram-zh.1.tar.gz) | Layer:12, Hidden:768, Heads:12 | ernie-gram-zh |
| [ERNIE-Gram Base for English](https://ernie-github.cdn.bcebos.com/model-ernie-gram-en.1.tar.gz) | Layer:12, Hidden:768, Heads:12 | ernie-gram-en |
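These abbreviations can be passed straight to `from_pretrained`, which downloads and caches the weights on first use. A minimal sketch following the API used by the demo scripts in this diff (the sample text and `num_labels` are placeholders; `encode`'s return convention is assumed from the main repo's usage):
```python
import numpy as np
import paddle as P
from ernie.modeling_ernie import ErnieModelForSequenceClassification
from ernie.tokenizing_ernie import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained('ernie-gram-zh')
model = ErnieModelForSequenceClassification.from_pretrained(
    'ernie-gram-zh', num_labels=3, name='')  # num_labels is task-specific
model.eval()

ids, sids = tokenizer.encode('ERNIE-Gram 是多粒度预训练模型')
ids = P.to_tensor(np.expand_dims(ids, 0))    # add the batch dimension
sids = P.to_tensor(np.expand_dims(sids, 0))
loss, logits = model(ids, sids)              # loss is None when no labels are given
```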
##### 4. Download datasets
**English Datasets**
Download the [GLUE datasets](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e).
The `--data_dir` option in the following section assumes a directory tree like this:
```shell
data/xnli
├── dev
│   └── 1
├── test
│   └── 1
└── train
└── 1
```
See the [demo](https://ernie-github.cdn.bcebos.com/data-mnli-m.tar.gz) data for the MNLI task.
### Fine-tuning
Try eager execution with `dygraph` models:
- [Natural Language Inference](./demo/finetune_classifier_distributed.py)
- [Sentiment Analysis](./demo/finetune_sentiment_analysis.py)
- [Semantic Similarity](./demo/finetune_classifier.py)
- [Named Entity Recognition (NER)](./demo/finetune_ner.py)
- [Machine Reading Comprehension](./demo/finetune_mrc.py)
**Recommended hyperparameters:**
- See **ERNIE-Gram** paper [Appendix B.1-4](https://arxiv.org/abs/2010.12148)
For a full reproduction of the paper results, please check out the `repro` branch of this repo.
### Citation
```
@article{xiao2020ernie,
title={ERNIE-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding},
author={Xiao, Dongling and Li, Yu-Kun and Zhang, Han and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
journal={arXiv preprint arXiv:2010.12148},
year={2020}
}
```
### Communication
- [ERNIE homepage](https://wenxin.baidu.com/)
- [Github Issues](https://github.com/PaddlePaddle/ERNIE/issues): bug reports, feature requests, install issues, usage issues, etc.
- QQ discussion group: 760439550 (ERNIE discussion group).
- QQ discussion group: 958422639 (ERNIE discussion group-v2).
- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
README.zh.md
\ No newline at end of file
[English](./README.en.md)|Simplified Chinese
`Reminder`: the *ERNIE-Gram* Chinese and English models have been [officially open-sourced](#3-下载预训练模型可选), and the paper-reproduction code will soon be open-sourced to the [repro branch](https://github.com/PaddlePaddle/ERNIE/tree/repro). You can now try the open-sourced *ERNIE-Gram* Chinese/English models with the new ERNIE toolkit, fully upgraded for Paddle 2.0 and its unified dynamic/static graphs.
## _ERNIE-Gram_: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding
![ERNIE-Gram](.meta/ernie-gram.jpeg)
- [Framework](#模型框架)
- [Quick Tour](#快速上手)
- [Setup & Usage](#安装)
    * [Install PaddlePaddle](#1-安装-paddlepaddle)
    * [Install the ERNIE toolkit](#2-安装-ernie-套件)
    * [Download pre-trained models (optional)](#3-下载预训练模型可选)
    * [Download task datasets](#4-下载数据集)
- [Supported NLP tasks](#支持的-nlp-任务)
- [Citation](#文献引用)
### Framework
Since **ERNIE 1.0**, Baidu researchers have introduced **knowledge-enhanced** learning into pre-training, masking consecutive words, phrases, named entities, and other semantic knowledge units to achieve better pre-training. **ERNIE-Gram**, the general-purpose semantic understanding model open-sourced here, goes a step further: it proposes an **explicit** and **complete** n-gram masked language model that explicitly models n-gram semantic units.
#### ERNIE multi-granularity pre-training for semantic understanding
As the basic semantic units of natural language, richer language granularities help a model achieve stronger semantic understanding:
- **ERNIE-Gram** proposes an explicit and complete **n-gram** multi-granularity masked language model that jointly models the semantic relations **within** and **between** n-grams, learning **fine-grained** and **coarse-grained** semantic information at the same time
- **ERNIE-Gram** adopts a two-stream structure that predicts multiple semantic granularities at a single position during pre-training, further strengthening semantic knowledge learning
With this multi-granularity pre-training technique, **ERNIE-Gram** learns explicit multi-granularity semantic signals during **pre-training** and adopts BERT-style **fine-tuning**, achieving **SOTA** on **10** authoritative English tasks without adding parameters or computational complexity. On Chinese tasks, **ERNIE-Gram** achieves public **SOTA** on tasks requiring rich, multi-level semantic understanding, such as NLI and reading comprehension.
The **ERNIE-Gram** work has been accepted as a long paper at **NAACL-HLT 2021**; see [link](https://arxiv.org/abs/2010.12148) for more details.
### Quick Tour (runnable example to be added)
```shell
mkdir -p data
cd data
wget https://ernie-github.cdn.bcebos.com/data-xnli.tar.gz
tar xf data-xnli.tar.gz
cd ..
#demo for NLI task
sh ernie_gram/run_cls.sh ernie_gram/task_configs/xnli_conf
```
### Setup
##### 1. Install PaddlePaddle
This project requires PaddlePaddle 2.0.0+; please refer to [here](https://www.paddlepaddle.org.cn/install/quick) to install PaddlePaddle.
##### 2. Install the ERNIE toolkit
```shell
git clone https://github.com/PaddlePaddle/ERNIE.git --depth 1
cd ERNIE
pip install -r requirements.txt
pip install -e .
```
`propeller` is a high-level framework that supports model training, with common NLP pre- and post-processing pipelines built in. You can import `propeller` by putting the root of this repo on `PYTHONPATH`:
```shell
export PYTHONPATH=$PWD:$PYTHONPATH
```
##### 3. Download pre-trained models (optional)
| Model | Details |Abbreviation|
| :------------------------------------------------- |:------------------------------------------------------------------------- |:-------|
| [ERNIE-Gram Chinese](https://ernie-github.cdn.bcebos.com/model-ernie-gram-zh.1.tar.gz) | Layer:12, Hidden:768, Heads:12 |ernie-gram-zh|
| [ERNIE-Gram English](https://ernie-github.cdn.bcebos.com/model-ernie-gram-en.1.tar.gz) | Layer:12, Hidden:768, Heads:12 |ernie-gram-en|
##### 4. Download datasets
Please organize the data directories as follows (the data path is passed to the training scripts via the `--data_dir` option):
```shell
data/xnli
├── dev
│   └── 1
├── test
│   └── 1
└── train
└── 1
```
**Chinese datasets**
| Dataset | Description |
|:--------|:----------|
| [XNLI](https://ernie-github.cdn.bcebos.com/data-xnli.tar.gz) | XNLI is a natural language inference dataset built jointly by researchers from Facebook and New York University, covering 15 languages. We use its Chinese portion to evaluate the model's language understanding ability. [Link](https://github.com/facebookresearch/XNLI) |
| [ChnSentiCorp](https://ernie-github.cdn.bcebos.com/data-chnsenticorp.tar.gz) | ChnSentiCorp is a Chinese sentiment analysis dataset containing online shopping reviews of hotels, laptops, and books. |
| [MSRA-NER](https://ernie-github.cdn.bcebos.com/data-msra_ner.tar.gz) | MSRA-NER (SIGHAN2006), released by Microsoft Research Asia, targets the recognition of entities with specific meaning in text, including person, location, and organization names. |
| [NLPCC2016-DBQA](https://ernie-github.cdn.bcebos.com/data-dbqa.tar.gz) | NLPCC2016-DBQA is an evaluation task organized in 2016 by NLPCC, the international conference on natural language processing and Chinese computing; the goal is to find documents among the candidates that answer a given question. [Link](http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf) |
| [CMRC2018](https://ernie-github.cdn.bcebos.com/data-cmrc2018.tar.gz) | CMRC2018 is an extractive reading comprehension evaluation organized by the Chinese Information Processing Society of China. [Link](https://github.com/ymcui/cmrc2018) |
### Supported NLP tasks
Fine-tune with `dygraph` (eager-mode) models:
- [Sentence-pair Classification](./demo/finetune_classifier_distributed.py)
- [Semantic Matching](./demo/finetune_classifier_distributed.py)
- [Machine Reading Comprehension](./demo/finetune_mrc.py)
**Recommended hyperparameters:**
|Task|batch size|learning rate|
|--|--|--|
| XNLI | 256 | 1.5e-4 |
| LCQMC | 16 | 4e-5 |
| DRCD | 64 | 5e-5 |
| CMRC2018 | 64 | 1.5e-4 |
| DuReader | 64 | 1.5e-5 |
| MSRA-NER(SIGHAN2006) | 16 | 5e-5 |
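For example, the XNLI row corresponds to the `xnli_conf` shipped in this change, which `run_cls.sh` sources before launching (note that `bsz=32` in the config is the per-card batch size; the 256 above presumably aggregates across cards):
```shell
# Key values from ernie_gram/task_configs/xnli_conf (the full file appears later in this diff):
data_dir="data/xnli"
max_steps=4600   # ~3 epochs
lr=1.5e-4
num_labels=3
from_pretrained="ernie-gram-zh"
pair_input=1
bsz=32

sh ernie_gram/run_cls.sh ernie_gram/task_configs/xnli_conf
```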
To reproduce all the experiments in the paper, please switch to the `repro` branch of this repo.
### Citation
```
@article{xiao2020ernie,
title={ERNIE-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding},
author={Xiao, Dongling and Li, Yu-Kun and Zhang, Han and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
journal={arXiv preprint arXiv:2010.12148},
year={2020}
}
```
### Communication
- [ERNIE homepage](https://wenxin.baidu.com/)
- [Github Issues](https://github.com/PaddlePaddle/ERNIE/issues): bug reports, feature requests, install issues, usage issues, etc.
- QQ discussion group: 760439550 (ERNIE discussion group).
- QQ discussion group 2: 958422639 (ERNIE discussion group-v2).
- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import logging
import json
import re
from random import random
from functools import reduce, partial
import numpy as np
#from visualdl import LogWriter
from pathlib import Path
import paddle as P
from propeller import log
import propeller.paddle as propeller
#from model.bert import BertConfig, BertModelLayer
from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification
from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
from ernie_gram.optimization import AdamW
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
log.setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
parser = propeller.ArgumentParser('classify model with ERNIE')
parser.add_argument(
'--from_pretrained',
type=Path,
required=True,
help='pretrained model directory or tag')
parser.add_argument(
'--max_seqlen',
type=int,
default=128,
help='max sentence length, should not greater than 512')
parser.add_argument('--bsz', type=int, default=32, help='batchsize')
parser.add_argument(
'--data_dir',
type=str,
required=True,
help='data directory includes train / develop data')
parser.add_argument(
'--max_steps',
type=int,
required=True,
help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE')
parser.add_argument('--warmup_proportion', type=float, default=0.1)
parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
parser.add_argument('--lr_decay', type=float, default=0.8, help='layerwise learning decay rate')
parser.add_argument('--decay_layers', type=int, default=12, help='number of layers for layerwise learning-rate decay')
parser.add_argument('--label_map', type=str, default="", help='str to int')
parser.add_argument('--num_labels', type=int, default=2, help='number of labels')
parser.add_argument('--valid_steps', type=int, default=100, help='The steps interval to evaluate model performance.')
parser.add_argument('--pair_input', type=int, default=0, help='is sentence pair task or not')
parser.add_argument(
'--save_dir', type=Path, required=True, help='model output directory')
parser.add_argument(
'--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
parser.add_argument(
'--init_checkpoint',
type=str,
default=None,
help='checkpoint to warm start from')
parser.add_argument(
'--use_amp',
action='store_true',
    help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
)
args = parser.parse_args()
env = P.distributed.ParallelEnv()
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)
if args.label_map:
label_map = {k.encode(): v for k, v in json.loads(args.label_map).items()}
else:
label_map = {str(l).encode(): l for l in range(args.num_labels)}
text_col_names = ["seg_a", "seg_b"] if args.pair_input else ["seg_a"]
feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn(
col_name,
unk_id=tokenizer.unk_id,
vocab_dict=tokenizer.vocab,
tokenizer=tokenizer.tokenize) for col_name in text_col_names] + [
propeller.data.LabelColumn(
'label',
vocab_dict=label_map),
])
def map_fn_pair(seg_a, seg_b, label):
seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen)
sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b)
return sentence, segments, label
def map_fn_single(seg_a, label):
seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=args.max_seqlen)
sentence, segments = tokenizer.build_for_ernie(seg_a, [])
return sentence, segments, label
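# For a pair task, build_for_ernie concatenates "[CLS] seg_a [SEP] seg_b [SEP]"
# and returns token ids plus matching segment ids (0 for the first segment,
# 1 for the second), which the model consumes as (src_ids, sent_ids).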
map_fn = map_fn_pair if args.pair_input else map_fn_single
train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'),
shuffle=True, repeat=True, use_gz=False, shard=True) \
.map(map_fn) \
.padded_batch(args.bsz, (0, 0, 0))
dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'),
shuffle=False, repeat=False, use_gz=False) \
.map(map_fn) \
.padded_batch(args.bsz, (0, 0, 0))
test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'),
shuffle=False, repeat=False, use_gz=False) \
.map(map_fn) \
.padded_batch(args.bsz, (0, 0, 0))
shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1])
types = ('int64', 'int64', 'int64')
P.distributed.init_parallel_env()
model = ErnieModelForSequenceClassification.from_pretrained(
args.from_pretrained, num_labels=args.num_labels, name='')
if args.init_checkpoint is not None:
log.info('loading checkpoint from %s' % args.init_checkpoint)
sd = P.load(args.init_checkpoint)
model.set_state_dict(sd)
model = P.DataParallel(model)
g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental
param_name_to_exclude_from_weight_decay = re.compile(
    r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
lr_scheduler = P.optimizer.lr.LambdaDecay(
args.lr,
get_warmup_and_linear_decay(args.max_steps,
int(args.warmup_proportion * args.max_steps)))
opt = AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
    apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
weight_decay=args.wd,
grad_clip=g_clip,
layerwise_lr_decay_rate=args.lr_decay,
n_layers=args.decay_layers)
scaler = P.amp.GradScaler(enable=args.use_amp)
step = 0
create_if_not_exists(args.save_dir)
#with LogWriter(logdir=str(create_if_not_exists(args.save_dir / 'vdl-%d' % env.dev_id))) as log_writer:
with P.amp.auto_cast(enable=args.use_amp):
for ids, sids, label in P.io.DataLoader(
train_ds, places=P.CUDAPlace(env.dev_id), batch_size=None):
step += 1
loss, _ = model(ids, sids, labels=label)
loss = scaler.scale(loss)
loss.backward()
scaler.minimize(opt, loss)
model.clear_gradients()
lr_scheduler.step()
# do logging
if step % 10 == 0:
_lr = lr_scheduler.get_lr()
if args.use_amp:
_l = (loss / scaler._scale).numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
env.dev_id, step, _l, _lr, scaler._scale.numpy())
else:
_l = loss.numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
env.dev_id, step, _l, _lr)
log.debug(msg)
#log_writer.add_scalar('loss', _l, step=step)
#log_writer.add_scalar('lr', _lr, step=step)
# do saving
if step % args.valid_steps == 0 and env.dev_id == 0:
acc = []
with P.no_grad():
model.eval()
for d in P.io.DataLoader(
dev_ds, places=P.CUDAPlace(env.dev_id),
batch_size=None):
ids, sids, label = d
loss, logits = model(ids, sids, labels=label)
a = (logits.argmax(-1) == label)
acc.append(a.numpy())
model.train()
acc = np.concatenate(acc).mean()
#log_writer.add_scalar('eval/acc', acc, step=step)
log.debug('dev acc %.5f' % acc)
acc = []
with P.no_grad():
model.eval()
for d in P.io.DataLoader(
test_ds, places=P.CUDAPlace(env.dev_id),
batch_size=None):
ids, sids, label = d
loss, logits = model(ids, sids, labels=label)
a = (logits.argmax(-1) == label)
acc.append(a.numpy())
model.train()
acc = np.concatenate(acc).mean()
#log_writer.add_scalar('eval/acc', acc, step=step)
log.debug('test acc %.5f' % acc)
if args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
# exit
if step > args.max_steps:
break
if args.save_dir is not None and env.dev_id == 0:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
log.debug('done')
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import os
import re
import time
import logging
import json
from pathlib import Path
from random import random
from tqdm import tqdm
from functools import reduce, partial
import pickle
import argparse
from io import open
import numpy as np
import paddle as P
from propeller import log
import propeller.paddle as propeller
from ernie_gram.optimization import AdamW
from ernie.modeling_ernie import ErnieModel, ErnieModelForQuestionAnswering
from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
#from ernie.optimization import AdamW, LinearDecay
from ernie_gram.mrc import mrc_reader
from ernie_gram.mrc import mrc_metrics
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
log.setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
def evaluate(model, ds, all_examples, all_features, tokenizer, args):
dev_file = json.loads(open(args.dev_file, encoding='utf8').read())
with P.no_grad():
log.debug('start eval')
model.eval()
all_res = []
for step, (uids, token_ids, token_type_ids, _, __) in enumerate(
P.io.DataLoader(
ds, places=P.CUDAPlace(env.dev_id), batch_size=None)):
_, start_logits, end_logits = model(token_ids, token_type_ids)
res = [
mrc_metrics.RawResult(
unique_id=u, start_logits=s, end_logits=e)
for u, s, e in zip(uids.numpy(),
start_logits.numpy(), end_logits.numpy())
]
all_res += res
open('all_res', 'wb').write(pickle.dumps(all_res))
all_pred, all_nbests = mrc_metrics.make_results(
tokenizer,
all_examples,
all_features,
all_res,
n_best_size=args.n_best_size,
max_answer_length=args.max_answer_length,
do_lower_case=tokenizer.lower)
f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred)
model.train()
log.debug('done eval')
return f1, em
def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
tokenizer, args):
model = P.DataParallel(model)
max_steps = args.max_steps
g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental
lr_scheduler = P.optimizer.lr.LambdaDecay(
args.lr,
get_warmup_and_linear_decay(max_steps,
int(args.warmup_proportion * max_steps)))
opt = AdamW(
lr_scheduler,
parameters=model.parameters(),
weight_decay=args.wd,
grad_clip=g_clip)
train_dataset = train_dataset \
.cache_shuffle_shard(env.nranks, env.dev_id, drop_last=True) \
.padded_batch(args.bsz)
log.debug('init training with args: %s' % repr(args))
scaler = P.amp.GradScaler(enable=args.use_amp)
create_if_not_exists(args.save_dir)
with P.amp.auto_cast(enable=args.use_amp):
for step, (_, token_ids, token_type_ids, start_pos,
end_pos) in enumerate(
P.io.DataLoader(
train_dataset,
places=P.CUDAPlace(env.dev_id),
batch_size=None)):
loss, _, __ = model(
token_ids,
token_type_ids,
start_pos=start_pos,
end_pos=end_pos)
loss = scaler.scale(loss)
loss.backward()
scaler.minimize(opt, loss)
model.clear_gradients()
lr_scheduler.step()
if env.dev_id == 0 and step % 10==0 and step:
_lr = lr_scheduler.get_lr()
if args.use_amp:
_l = (loss / scaler._scale).numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
env.dev_id, step, _l, _lr, scaler._scale.numpy())
else:
_l = loss.numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
env.dev_id, step, _l, _lr)
log.debug(msg)
if env.dev_id == 0 and step % 100==0 and step:
print(step)
f1, em = evaluate(model, dev_dataset, dev_examples,
dev_features, tokenizer, args)
log.debug('[step %d] eval result: f1 %.5f em %.5f' %
(step, f1, em))
if env.dev_id == 0 and args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
if step > max_steps:
break
if __name__ == "__main__":
parser = argparse.ArgumentParser('MRC model with ERNIE')
parser.add_argument(
'--from_pretrained',
type=Path,
required=True,
help='pretrained model directory or tag')
parser.add_argument(
'--max_seqlen',
type=int,
default=512,
help='max sentence length, should not greater than 512')
parser.add_argument('--bsz', type=int, default=16, help='batchsize')
parser.add_argument('--max_steps', type=int, required=True, help='max steps')
parser.add_argument(
'--train_file',
type=str,
required=True,
help='data directory includes train / develop data')
parser.add_argument(
'--dev_file',
type=str,
required=True,
help='data directory includes train / develop data')
parser.add_argument('--warmup_proportion', type=float, default=0.0)
parser.add_argument('--lr', type=float, default=3e-5, help='learning rate')
parser.add_argument(
'--save_dir', type=Path, required=True, help='model output directory')
parser.add_argument(
'--n_best_size', type=int, default=20, help='nbest prediction to keep')
parser.add_argument(
'--max_answer_length', type=int, default=100, help='max answer span')
parser.add_argument(
'--wd',
type=float,
default=0.01,
help='weight decay, aka L2 regularizer')
parser.add_argument(
'--use_amp',
action='store_true',
        help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
)
args = parser.parse_args()
env = P.distributed.ParallelEnv()
P.distributed.init_parallel_env()
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
if not os.path.exists(args.train_file):
raise RuntimeError('input data not found at %s' % args.train_file)
if not os.path.exists(args.dev_file):
raise RuntimeError('input data not found at %s' % args.dev_file)
log.info('making train/dev data...')
train_examples = mrc_reader.read_files(args.train_file, is_training=True)
train_features = mrc_reader.convert_example_to_features(
train_examples, args.max_seqlen, tokenizer, is_training=True)
dev_examples = mrc_reader.read_files(args.dev_file, is_training=False)
dev_features = mrc_reader.convert_example_to_features(
dev_examples, args.max_seqlen, tokenizer, is_training=False)
log.info('train examples: %d, features: %d' %
(len(train_examples), len(train_features)))
def map_fn(unique_id, example_index, doc_span_index, tokens,
token_to_orig_map, token_is_max_context, token_ids,
position_ids, text_type_ids, start_position, end_position):
if start_position is None:
start_position = 0
if end_position is None:
end_position = 0
return np.array(unique_id), np.array(token_ids), np.array(
text_type_ids), np.array(start_position), np.array(end_position)
train_dataset = propeller.data.Dataset.from_list(train_features).map(
map_fn)
dev_dataset = propeller.data.Dataset.from_list(dev_features).map(
map_fn).padded_batch(args.bsz)
model = ErnieModelForQuestionAnswering.from_pretrained(
args.from_pretrained, name='')
train(model, train_dataset, dev_dataset, dev_examples, dev_features,
tokenizer, args)
if env.dev_id == 0:
f1, em = evaluate(model, dev_dataset, dev_examples, dev_features,
tokenizer, args)
log.debug('final eval result: f1 %.5f em %.5f' % (f1, em))
if env.dev_id == 0 and args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
import logging
import six
import json
from random import random
from tqdm import tqdm
from collections import OrderedDict
from functools import reduce, partial
from pathlib import Path
from visualdl import LogWriter
import numpy as np
import multiprocessing
import pickle
from sklearn.metrics import f1_score
import paddle as P
from propeller import log
import propeller.paddle as propeller
log.setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification, ErnieModelForTokenClassification
from ernie.tokenizing_ernie import ErnieTokenizer
from ernie_gram.optimization import AdamW
parser = propeller.ArgumentParser('NER model with ERNIE')
parser.add_argument('--max_seqlen', type=int, default=256)
parser.add_argument('--bsz', type=int, default=16)
parser.add_argument('--data_dir', type=str, required=True)
parser.add_argument('--epoch', type=int, default=10)
parser.add_argument(
'--warmup_proportion',
type=float,
default=0.1,
help='if use_lr_decay is set, '
'learning rate will raise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. at `max_steps`'
)
parser.add_argument(
'--max_steps',
type=int,
required=True,
help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE, used in learning rate scheduler'
)
parser.add_argument(
'--use_amp',
action='store_true',
    help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
)
parser.add_argument('--from_pretrained', type=Path, required=True)
parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
parser.add_argument(
'--save_dir', type=Path, required=True, help='model output directory')
parser.add_argument(
'--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
args = parser.parse_args()
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
def tokenizer_func(inputs):
ret = inputs.split(b'\2')
tokens, orig_pos = [], []
for i, r in enumerate(ret):
t = tokenizer.tokenize(r)
for tt in t:
tokens.append(tt)
orig_pos.append(i)
assert len(tokens) == len(orig_pos)
return tokens + orig_pos
def tokenizer_func_for_label(inputs):
return inputs.split(b'\2')
feature_map = {
b"B-PER": 0,
b"I-PER": 1,
b"B-ORG": 2,
b"I-ORG": 3,
b"B-LOC": 4,
b"I-LOC": 5,
b"O": 6,
}
other_tag_id = feature_map[b'O']
feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn(
'text_a',
unk_id=tokenizer.unk_id,
vocab_dict=tokenizer.vocab,
tokenizer=tokenizer_func), propeller.data.TextColumn(
'label',
unk_id=other_tag_id,
vocab_dict=feature_map,
tokenizer=tokenizer_func_for_label, )
])
def before(seg, label):
seg, orig_pos = np.split(seg, 2)
aligned_label = label[orig_pos]
seg, _ = tokenizer.truncate(seg, [], args.max_seqlen)
aligned_label, _ = tokenizer.truncate(aligned_label, [], args.max_seqlen)
orig_pos, _ = tokenizer.truncate(orig_pos, [], args.max_seqlen)
sentence, segments = tokenizer.build_for_ernie(
seg
) #utils.data.build_1_pair(seg, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
aligned_label = np.concatenate([[0], aligned_label, [0]], 0)
orig_pos = np.concatenate([[0], orig_pos, [0]])
assert len(aligned_label) == len(sentence) == len(orig_pos), (
len(aligned_label), len(sentence), len(orig_pos)) # alinged
return sentence, segments, aligned_label, label, orig_pos
train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \
.map(before) \
        .padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0))
dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
        .padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0))
test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
        .padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0))
def evaluate(model, dataset):
model.eval()
with P.no_grad():
chunkf1 = propeller.metrics.ChunkF1(None, None, None, len(feature_map))
for step, (ids, sids, aligned_label, label, orig_pos
) in enumerate(P.io.DataLoader(
dataset, batch_size=None)):
loss, logits = model(ids, sids)
#print('\n'.join(map(str, logits.numpy().tolist())))
assert orig_pos.shape[0] == logits.shape[0] == ids.shape[
0] == label.shape[0]
for pos, lo, la, id in zip(orig_pos.numpy(),
logits.numpy(),
label.numpy(), ids.numpy()):
_dic = OrderedDict()
assert len(pos) == len(lo) == len(id)
for _pos, _lo, _id in zip(pos, lo, id):
if _id > tokenizer.mask_id: # [MASK] is the largest special token
_dic.setdefault(_pos, []).append(_lo)
merged_lo = np.array(
[np.array(l).mean(0) for _, l in six.iteritems(_dic)])
merged_preds = np.argmax(merged_lo, -1)
la = la[np.where(la != (other_tag_id + 1))] #remove pad
if len(la) > len(merged_preds):
log.warn(
'accuracy loss due to truncation: label len:%d, truncate to %d'
% (len(la), len(merged_preds)))
merged_preds = np.pad(merged_preds,
[0, len(la) - len(merged_preds)],
mode='constant',
constant_values=7)
else:
assert len(la) == len(
merged_preds
), 'expect label == prediction, got %d vs %d' % (
la.shape, merged_preds.shape)
chunkf1.update((merged_preds, la, np.array(len(la))))
#f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro')
f1 = chunkf1.eval()
model.train()
return f1
model = ErnieModelForTokenClassification.from_pretrained(
args.from_pretrained,
num_labels=len(feature_map),
name='',
has_pooler=False)
g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental
param_name_to_exclude_from_weight_decay = re.compile(
    r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
lr_scheduler = P.optimizer.lr.LambdaDecay(
args.lr,
get_warmup_and_linear_decay(args.max_steps,
int(args.warmup_proportion * args.max_steps)))
opt = AdamW(
lr_scheduler,
parameters=model.parameters(),
weight_decay=args.wd,
    apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
grad_clip=g_clip)
scaler = P.amp.GradScaler(enable=args.use_amp)
with LogWriter(
logdir=str(create_if_not_exists(args.save_dir / 'vdl'))) as log_writer:
with P.amp.auto_cast(enable=args.use_amp):
for epoch in range(args.epoch):
for step, (
ids, sids, aligned_label, label, orig_pos
) in enumerate(P.io.DataLoader(
train_ds, batch_size=None)):
loss, logits = model(ids, sids, labels=aligned_label)
#loss, logits = model(ids, sids, labels=aligned_label, loss_weights=P.cast(ids != 0, 'float32'))
loss = scaler.scale(loss)
loss.backward()
scaler.minimize(opt, loss)
model.clear_gradients()
lr_scheduler.step()
if step % 10 == 0:
_lr = lr_scheduler.get_lr()
if args.use_amp:
_l = (loss / scaler._scale).numpy()
msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
step, _l, _lr, scaler._scale.numpy())
else:
_l = loss.numpy()
msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l,
_lr)
log.debug(msg)
log_writer.add_scalar('loss', _l, step=step)
log_writer.add_scalar('lr', _lr, step=step)
if step % 100 == 0:
f1 = evaluate(model, dev_ds)
log.debug('dev eval f1: %.5f' % f1)
log_writer.add_scalar('dev eval/f1', f1, step=step)
f1 = evaluate(model, test_ds)
log.debug('test eval f1: %.5f' % f1)
log_writer.add_scalar('test eval/f1', f1, step=step)
if args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
f1 = evaluate(model, dev_ds)
log.debug('final eval f1: %.5f' % f1)
log_writer.add_scalar('eval/f1', f1, step=step)
if args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
This diff has been collapsed.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import sys
import argparse
import logging
from functools import partial
from io import open
open = partial(open, encoding='utf-8')
import json
from collections import namedtuple
log = logging.getLogger(__name__)
Example = namedtuple('Example', [
'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
'start_position', 'end_position'
])
Feature = namedtuple("Feature", [
"unique_id", "example_index", "doc_span_index", "tokens",
"token_to_orig_map", "token_is_max_context", "token_ids", "position_ids",
"text_type_ids", "start_position", "end_position"
])
def _tokenize_chinese_chars(text):
"""Adds whitespace around any CJK character."""
def _is_chinese_char(cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
output = []
buff = ""
for char in text:
cp = ord(char)
if _is_chinese_char(cp):
if buff != "":
output.append(buff)
buff = ""
output.append(char)
else:
buff += char
if buff != "":
output.append(buff)
return output
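# Example: _tokenize_chinese_chars("abc你好") -> ["abc", "你", "好"];
# each CJK character becomes its own token while other runs stay intact.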
def _check_is_max_context(doc_spans, cur_span_index, position):
"""chech is max context"""
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""improve answer span"""
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def read_files(input_file, is_training):
"""read file"""
examples = []
with open(input_file, "r") as f:
input_data = json.load(f)["data"]
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_pos = None
end_pos = None
orig_answer_text = None
if is_training:
if len(qa["answers"]) != 1:
raise ValueError(
"For training, each question should have exactly 1 answer."
)
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
doc_tokens = [
paragraph_text[:answer_offset], paragraph_text[
answer_offset:answer_offset + answer_length],
paragraph_text[answer_offset + answer_length:]
]
start_pos = 1
end_pos = 1
actual_text = " ".join(doc_tokens[start_pos:(end_pos +
1)])
if actual_text.find(orig_answer_text) == -1:
log.info("Could not find answer: '%s' vs. '%s'",
actual_text, orig_answer_text)
continue
else:
doc_tokens = _tokenize_chinese_chars(paragraph_text)
example = Example(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_pos,
end_position=end_pos)
examples.append(example)
return examples
def convert_example_to_features(examples,
max_seq_length,
tokenizer,
is_training,
doc_stride=128,
max_query_length=64):
"""convert example to feature"""
features = []
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
#log.info(orig_to_tok_index, example.start_position)
tok_start_position = None
tok_end_position = None
if is_training:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position,
tokenizer, example.orig_answer_text)
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
_DocSpan = namedtuple("DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
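        # Sliding-window illustration: with 1000 doc tokens, doc_stride=128 and
        # max_tokens_for_doc=381, spans start at offsets 0, 128, 256, ... and
        # overlap; _check_is_max_context later marks, for each token, the span
        # in which it has the most surrounding context.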
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in query_tokens:
tokens.append(token)
text_type_ids.append(0)
tokens.append("[SEP]")
text_type_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[
split_token_index]
is_max_context = _check_is_max_context(
doc_spans, doc_span_index, split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
text_type_ids.append(1)
tokens.append("[SEP]")
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
start_position = None
end_position = None
if is_training:
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
feature = Feature(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
token_ids=token_ids,
position_ids=position_ids,
text_type_ids=text_type_ids,
start_position=start_position,
end_position=end_position)
features.append(feature)
unique_id += 1
return features
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--input", type=str, default=None)
args = parser.parse_args()
from ernie.tokenizing_ernie import ErnieTokenizer
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
examples = read_files(args.input, True)
features = convert_example_to_features(examples, 512, tokenizer, True)
log.debug(len(examples))
log.debug(len(features))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
import logging
import re
from paddle.fluid import framework
from paddle.fluid.framework import Variable, default_main_program
import numpy as np
import paddle as P
import paddle.distributed.fleet as fleet
from propeller.paddle.train.hooks import RunHook
import paddle.fluid as F
import paddle.fluid.layers as L  # L.assign is used in AdamW.apply_optimize below
log = logging.getLogger(__name__)
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
class AdamW(P.optimizer.AdamW):
"""AdamW object for dygraph"""
def __init__(self, *args, **kwargs):
layerwise_lr_decay = kwargs.pop('layerwise_lr_decay_rate', 0.8)
n_layers = kwargs.pop('n_layers', 12)
var_name_to_exclude = kwargs.pop('var_name_to_exclude', '.*layer_norm_scale|.*layer_norm_bias|.*b_0')
        super(AdamW, self).__init__(*args, **kwargs)
        self.wd = kwargs.get('weight_decay', 0.01)  # cached for apply_optimize below
        self.ld = layerwise_lr_decay
        self.pat = re.compile(var_name_to_exclude)
        self.n_layers = n_layers
def _get_layerwise_lr_decay_rate(self, param):
#if self.pat.match(param.name):
# return 1.0
if param.name.startswith("encoder_layer"):
layer = int(param.name.split("_")[2])
decay_rate = self.ld ** (self.n_layers - layer)
elif "embedding" in param.name:
decay_rate = self.ld ** (self.n_layers + 1)
else:
decay_rate = 1.0
return decay_rate
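    # Example: with the default decay rate 0.8 and n_layers=12, parameters in
    # 'encoder_layer_11_*' get 0.8**1 of the base lr, 'encoder_layer_0_*' get
    # 0.8**12, and embeddings get 0.8**13, so lower layers update more gently.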
def _create_param_lr(self, param_and_grad):
# create learning rate tensor for every parameter
param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate'] * self._get_layerwise_lr_decay_rate(param)
if type(param_lr) == Variable:
return param_lr
else:
if param_lr == 1.0:
return self._global_learning_rate()
else:
with default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope(
'scale_with_param_lr'):
return self._global_learning_rate() * param_lr
def apply_optimize(self, loss, startup_program, params_grads):
super(AdamW, self).apply_optimize(loss, startup_program, params_grads)
for p, g in params_grads:
#log.debug(L.reduce_mean(p))
if not self.pat.match(p.name):
L.assign(p * (1. - self.wd * self.current_step_lr()), p)
def optimization(
loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False, ):
"""do backword for static"""
def exclude_from_weight_decay(param):
        name = re.sub(r'\.master$', '', param)  # strip the AMP master-weight suffix; rstrip('.master') strips characters, not the suffix
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
g_clip = P.nn.ClipGradByGlobalNorm(1.0)
lr_scheduler = P.optimizer.lr.LambdaDecay(
learning_rate,
get_warmup_and_linear_decay(num_train_steps, warmup_steps))
optimizer = AdamW(
learning_rate=lr_scheduler,
weight_decay=weight_decay,
grad_clip=g_clip,
apply_decay_param_fun=exclude_from_weight_decay)
if use_fp16:
log.info('AMP activated')
if weight_decay > 0.:
raise ValueError(
'paddle amp will ignore `weight_decay`, see https://github.com/PaddlePaddle/Paddle/issues/29794'
)
#amp_list = P.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
# custom_white_list=['softmax', 'layer_norm', 'gelu'])
optimizer = P.fluid.contrib.mixed_precision.decorate(
optimizer, init_loss_scaling=2**15, use_dynamic_loss_scaling=True)
_, param_grads = optimizer.minimize(loss)
loss_scaling = P.static.default_main_program().global_block().var(
'loss_scaling_0')
else:
_, param_grads = optimizer.minimize(loss)
loss_scaling = None
class LRStepHook(RunHook):
def after_run(self, _, __):
lr_scheduler.step()
log.debug('lr step: %.5f' % lr_scheduler.get_lr())
return LRStepHook(), loss_scaling
source $1
python3 -m paddle.distributed.launch ./ernie_gram/finetune_classifier_distributed.py \
--data_dir $data_dir \
--max_steps $max_steps \
--bsz $bsz \
--lr $lr \
--label_map ${label_map:-""} \
--num_labels $num_labels \
--pair_input $pair_input \
--valid_steps $valid_steps \
--from_pretrained $from_pretrained \
--save_dir checkpoints
source $1
export CUDA_VISIBLE_DEVICES=0
python3 -m paddle.distributed.launch ./ernie_gram/finetune_mrc.py \
--train_file $train_file \
--dev_file $dev_file \
--max_steps $max_steps \
--lr $lr \
--from_pretrained $from_pretrained \
--save_dir checkpoints
source $1
python3 -m paddle.distributed.launch ./ernie_gram/finetune_ner.py \
--data_dir $data_dir \
--max_steps $max_steps \
--epoch $epoch \
--lr $lr \
--from_pretrained $from_pretrained \
--save_dir checkpoints
train_file="data/cmrc2018/train/train.json"
dev_file="data/cmrc2018/dev/dev.json"
max_steps=1320
lr=1.5e-4
from_pretrained="ernie-gram-zh"
data_dir="data/msra_ner"
epoch=10
max_steps=13040
lr=5e-5
from_pretrained="ernie-gram-zh"
data_dir="data/xnli"
max_steps=4600 #3 epoch
lr=1.5e-4
label_map='{"contradictory":0,"contradiction":0,"entailment":1,"neutral":2}'
num_labels=3
valid_steps=25
from_pretrained="ernie-gram-zh"
pair_input=1
bsz=32
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import sys
import argparse
import logging
import paddle
class UnpackDataLoader(paddle.io.DataLoader):
def __init__(self, *args, **kwargs):
super(UnpackDataLoader, self).__init__(*args, batch_size=1, **kwargs)
def __iter__(self):
return ([yy[0] for yy in y]
for y in super(UnpackDataLoader, self).__iter__())
def create_if_not_exists(dir):
try:
dir.mkdir(parents=True)
except FileExistsError:
pass
return dir
def get_warmup_and_linear_decay(max_steps, warmup_steps):
if warmup_steps == 0:
return lambda step: 1.0
else:
return lambda step: min(step / warmup_steps, 1. - (step - warmup_steps) / (max_steps - warmup_steps))
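# Quick sanity check of the schedule shape (illustrative, assumes
# max_steps=100, warmup_steps=10):
#   sched = get_warmup_and_linear_decay(100, 10)
#   sched(5)  == 0.5   # linear warmup: step / warmup_steps
#   sched(10) == 1.0   # peak multiplier at the end of warmup
#   sched(55) == 0.5   # halfway down the linear decay to 0 at step 100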