Unverified commit 701a823e, authored by pkpk, committed by GitHub

Merge pull request #73 from 0YuanZhang0/add_text_case

add_sequence_tagging_api
......@@ -186,14 +186,12 @@ Overall Architecture of GRU-CRF-MODEL
├── data/ # directory for the datasets
├── conf/ # dictionaries and default program configuration
├── images/ # images used in the documentation
├── utils/ # common utility functions
├── train.py # training script
├── predict.py # prediction script
├── eval.py # lexical analysis evaluation script
├── downloads.py # script for downloading data and models
├── downloads.sh # script for downloading data and models
├── sequence_tagging.yaml # configuration for model training, prediction and evaluation
└── reader.py # data-reading helpers
└── sequence_tagging.yaml # configuration for model training, prediction and evaluation
```
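As the hunks below show, the example's script-local `utils/` and `reader.py` helpers (and the download scripts) are dropped from this directory because the same functionality now ships inside the `hapi.text.sequence_tagging` package. A minimal sketch of the resulting import change, using only paths that appear in the diffs below:

```python
# Before this PR: modules local to the example directory.
# from utils.configure import PDConfig
# from utils.check import check_gpu, check_version
# from reader import LacDataset, LacDataLoader

# After this PR: the same names re-exported by the package.
from hapi.text.sequence_tagging import PDConfig
from hapi.text.sequence_tagging import check_gpu, check_version
from hapi.text.sequence_tagging import LacDataset, LacDataLoader
```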
......
......@@ -25,14 +25,14 @@ import math
import argparse
import numpy as np
from train import SeqTagging, ChunkEval, LacLoss
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
from reader import LacDataset, LacDataLoader
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.model import set_device, Input
from hapi.text.sequence_tagging import SeqTagging, ChunkEval, LacLoss
from hapi.text.sequence_tagging import LacDataset, LacDataLoader
from hapi.text.sequence_tagging import check_gpu, check_version
from hapi.text.sequence_tagging import PDConfig
import paddle.fluid as fluid
from paddle.fluid.layers.utils import flatten
......@@ -65,7 +65,10 @@ def main(args):
device=place)
model.load(args.init_from_checkpoint, skip_mismatch=True)
model.evaluate(eval_dataset.dataloader, batch_size=args.batch_size)
eval_result = model.evaluate(eval_dataset.dataloader, batch_size=args.batch_size)
print("precison: %.5f" % (eval_result["precision"][0]))
print("recall: %.5f" % (eval_result["recall"][0]))
print("F1: %.5f" % (eval_result["F1"][0]))
if __name__ == '__main__':
......
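For reference, `eval.py` above indexes element 0 of each metric entry because `ChunkEval` (defined further down) names its outputs `['precision', 'recall', 'F1']` and `model.evaluate` returns them as lists keyed by those names. A hypothetical sketch of that result shape, with placeholder numbers:

```python
# Placeholder values only; real numbers come from model.evaluate().
eval_result = {"precision": [0.98], "recall": [0.97], "F1": [0.975]}

print("precision: %.5f" % (eval_result["precision"][0]))
print("recall: %.5f" % (eval_result["recall"][0]))
print("F1: %.5f" % (eval_result["F1"][0]))
```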
......@@ -26,14 +26,14 @@ import math
import argparse
import numpy as np
from train import SeqTagging
from utils.check import check_gpu, check_version
from utils.configure import PDConfig
from reader import LacDataset, LacDataLoader
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.model import set_device, Input
from hapi.text.sequence_tagging import SeqTagging
from hapi.model import Input, set_device
from hapi.text.sequence_tagging import LacDataset, LacDataLoader
from hapi.text.sequence_tagging import check_gpu, check_version
from hapi.text.sequence_tagging import PDConfig
import paddle.fluid as fluid
from paddle.fluid.layers.utils import flatten
......
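Both scripts are driven by `sequence_tagging.yaml` through `PDConfig`. The diff elides that part, so the following is only a sketch of the usual pattern; the `yaml_file` keyword and the `build()`/`Print()` methods follow the `PDConfig` helper used in other PaddlePaddle examples and are assumptions here, as is the boolean argument to `check_gpu`:

```python
from hapi.text.sequence_tagging import PDConfig, check_gpu, check_version

# Assumption: PDConfig accepts a yaml_file and exposes build()/Print().
args = PDConfig(yaml_file="sequence_tagging.yaml")
args.build()
args.Print()

check_gpu(args.device == "gpu")  # assumed to take a use-GPU boolean
check_version()                  # verifies the installed PaddlePaddle version
```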
......@@ -28,183 +28,15 @@ import numpy as np
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.metrics import Metric
from hapi.model import Model, Input, set_device
from hapi.loss import Loss
from hapi.text.text import SequenceTagging
from utils.check import check_gpu, check_version
from utils.configure import PDConfig
from reader import LacDataset, LacDataLoader
from hapi.model import Input, set_device
from hapi.text.sequence_tagging import SeqTagging, LacLoss, ChunkEval
from hapi.text.sequence_tagging import LacDataset, LacDataLoader
from hapi.text.sequence_tagging import check_gpu, check_version
from hapi.text.sequence_tagging import PDConfig
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
__all__ = ["SeqTagging", "LacLoss", "ChunkEval"]
class SeqTagging(Model):
def __init__(self, args, vocab_size, num_labels, length=None,
mode="train"):
super(SeqTagging, self).__init__()
"""
define the lexical analysis network structure
word: stores the input of the model
for_infer: a boolean value, indicating if the model to be created is for training or predicting.
return:
for infer: return the prediction
otherwise: return the prediction
"""
self.mode_type = mode
self.word_emb_dim = args.word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = args.grnn_hidden_dim
self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
args) else 1.0
self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
args) else 1.0
self.bigru_num = args.bigru_num
self.batch_size = args.batch_size
self.init_bound = 0.1
self.length = length
self.sequence_tagging = SequenceTagging(
vocab_size=self.vocab_size,
num_labels=self.num_labels,
batch_size=self.batch_size,
word_emb_dim=self.word_emb_dim,
grnn_hidden_dim=self.grnn_hidden_dim,
emb_learning_rate=self.emb_lr,
crf_learning_rate=self.crf_lr,
bigru_num=self.bigru_num,
init_bound=self.init_bound,
length=self.length)
def forward(self, *inputs):
"""
Configure the network
"""
word = inputs[0]
lengths = inputs[1]
if self.mode_type == "train" or self.mode_type == "test":
target = inputs[2]
outputs = self.sequence_tagging(word, lengths, target)
else:
outputs = self.sequence_tagging(word, lengths)
return outputs
class Chunk_eval(fluid.dygraph.Layer):
def __init__(self,
num_chunk_types,
chunk_scheme,
excluded_chunk_types=None):
super(Chunk_eval, self).__init__()
self.num_chunk_types = num_chunk_types
self.chunk_scheme = chunk_scheme
self.excluded_chunk_types = excluded_chunk_types
def forward(self, input, label, seq_length=None):
precision = self._helper.create_variable_for_type_inference(
dtype="float32")
recall = self._helper.create_variable_for_type_inference(
dtype="float32")
f1_score = self._helper.create_variable_for_type_inference(
dtype="float32")
num_infer_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_label_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_correct_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
this_input = {"Inference": input, "Label": label}
if seq_length is not None:
this_input["SeqLength"] = seq_length
self._helper.append_op(
type='chunk_eval',
inputs=this_input,
outputs={
"Precision": [precision],
"Recall": [recall],
"F1-Score": [f1_score],
"NumInferChunks": [num_infer_chunks],
"NumLabelChunks": [num_label_chunks],
"NumCorrectChunks": [num_correct_chunks]
},
attrs={
"num_chunk_types": self.num_chunk_types,
"chunk_scheme": self.chunk_scheme,
"excluded_chunk_types": self.excluded_chunk_types or []
})
return (num_infer_chunks, num_label_chunks, num_correct_chunks)
class LacLoss(Loss):
def __init__(self):
super(LacLoss, self).__init__()
pass
def forward(self, outputs, labels):
avg_cost = outputs[1]
return avg_cost
class ChunkEval(Metric):
def __init__(self, num_labels, name=None, *args, **kwargs):
super(ChunkEval, self).__init__(*args, **kwargs)
self._init_name(name)
self.chunk_eval = Chunk_eval(
int(math.ceil((num_labels - 1) / 2.0)), "IOB")
self.reset()
def add_metric_op(self, *args):
crf_decode = args[0]
lengths = args[2]
label = args[3]
(num_infer_chunks, num_label_chunks,
num_correct_chunks) = self.chunk_eval(
input=crf_decode, label=label, seq_length=lengths)
return [num_infer_chunks, num_label_chunks, num_correct_chunks]
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
*args, **kwargs):
self.infer_chunks_total += num_infer_chunks
self.label_chunks_total += num_label_chunks
self.correct_chunks_total += num_correct_chunks
precision = float(
num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
recall = float(
num_correct_chunks) / num_label_chunks if num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if num_correct_chunks else 0
return [precision, recall, f1_score]
def reset(self):
self.infer_chunks_total = 0
self.label_chunks_total = 0
self.correct_chunks_total = 0
def accumulate(self):
precision = float(
self.correct_chunks_total
) / self.infer_chunks_total if self.infer_chunks_total else 0
recall = float(
self.correct_chunks_total
) / self.label_chunks_total if self.label_chunks_total else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.correct_chunks_total else 0
res = [precision, recall, f1_score]
return res
def _init_name(self, name):
name = name or 'chunk eval'
self._name = ['precision', 'recall', 'F1']
def name(self):
return self._name
def main(args):
place = set_device(args.device)
......
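The `SeqTagging` model removed from `train.py` above (and re-added to the package below) keeps its mode switch: `forward()` consumes `(word, lengths, target)` in "train"/"test" mode and only `(word, lengths)` otherwise. A minimal construction sketch; the hyper-parameters, vocabulary size, and label count are placeholders, and a dygraph context is assumed:

```python
from argparse import Namespace

import paddle.fluid as fluid
from hapi.model import set_device
from hapi.text.sequence_tagging import SeqTagging

place = set_device("cpu")      # or "gpu"
fluid.enable_dygraph(place)    # assumed: the sketch runs in dygraph mode

# Placeholder hyper-parameters; the scripts read them from sequence_tagging.yaml.
args = Namespace(word_emb_dim=128, grnn_hidden_dim=128, bigru_num=2, batch_size=32)

# vocab_size/num_labels are placeholders; the real values come from LacDataset.
# mode="train"/"test": forward(word, lengths, target) also returns the average cost.
# Any other mode, e.g. "predict": forward(word, lengths) returns the prediction only.
model = SeqTagging(args, vocab_size=20940, num_labels=57, mode="predict")
```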
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.text.sequence_tagging.reader import LacDataset as LacDataset
from hapi.text.sequence_tagging.reader import LacDataLoader as LacDataLoader
from hapi.text.sequence_tagging.sequence_tagging import SeqTagging as SeqTagging
from hapi.text.sequence_tagging.sequence_tagging import Chunk_eval as Chunk_eval
from hapi.text.sequence_tagging.sequence_tagging import LacLoss as LacLoss
from hapi.text.sequence_tagging.sequence_tagging import ChunkEval as ChunkEval
from hapi.text.sequence_tagging.utils.configure import PDConfig as PDConfig
from hapi.text.sequence_tagging.utils.check import check_gpu as check_gpu
from hapi.text.sequence_tagging.utils.check import check_version as check_version
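With these aliases in the package `__init__.py`, user code can import everything from the package root instead of reaching into the submodules; both forms below resolve to the same objects:

```python
# Flat imports via the re-exports in this __init__.py:
from hapi.text.sequence_tagging import LacDataset, SeqTagging, PDConfig

# Equivalent direct submodule imports:
from hapi.text.sequence_tagging.reader import LacDataset as _LacDataset
from hapi.text.sequence_tagging.sequence_tagging import SeqTagging as _SeqTagging
from hapi.text.sequence_tagging.utils.configure import PDConfig as _PDConfig

assert LacDataset is _LacDataset and SeqTagging is _SeqTagging and PDConfig is _PDConfig
```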
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging network structure
"""
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import math
import argparse
import numpy as np
from hapi.metrics import Metric
from hapi.model import Model, Input, set_device
from hapi.loss import Loss
from hapi.text.text import SequenceTagging
from hapi.text.sequence_tagging.utils.check import check_gpu, check_version
from hapi.text.sequence_tagging.utils.configure import PDConfig
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
class SeqTagging(Model):
def __init__(self, args, vocab_size, num_labels, length=None,
mode="train"):
super(SeqTagging, self).__init__()
"""
define the lexical analysis network structure
word: stores the input of the model
for_infer: a boolean value, indicating if the model to be created is for training or predicting.
return:
for infer: return the prediction
otherwise: return the prediction
"""
self.mode_type = mode
self.word_emb_dim = args.word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = args.grnn_hidden_dim
self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
args) else 1.0
self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
args) else 1.0
self.bigru_num = args.bigru_num
self.batch_size = args.batch_size
self.init_bound = 0.1
self.length = length
self.sequence_tagging = SequenceTagging(
vocab_size=self.vocab_size,
num_labels=self.num_labels,
batch_size=self.batch_size,
word_emb_dim=self.word_emb_dim,
grnn_hidden_dim=self.grnn_hidden_dim,
emb_learning_rate=self.emb_lr,
crf_learning_rate=self.crf_lr,
bigru_num=self.bigru_num,
init_bound=self.init_bound,
length=self.length)
def forward(self, *inputs):
"""
Configure the network
"""
word = inputs[0]
lengths = inputs[1]
if self.mode_type == "train" or self.mode_type == "test":
target = inputs[2]
outputs = self.sequence_tagging(word, lengths, target)
else:
outputs = self.sequence_tagging(word, lengths)
return outputs
class Chunk_eval(fluid.dygraph.Layer):
def __init__(self,
num_chunk_types,
chunk_scheme,
excluded_chunk_types=None):
super(Chunk_eval, self).__init__()
self.num_chunk_types = num_chunk_types
self.chunk_scheme = chunk_scheme
self.excluded_chunk_types = excluded_chunk_types
def forward(self, input, label, seq_length=None):
precision = self._helper.create_variable_for_type_inference(
dtype="float32")
recall = self._helper.create_variable_for_type_inference(
dtype="float32")
f1_score = self._helper.create_variable_for_type_inference(
dtype="float32")
num_infer_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_label_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_correct_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
this_input = {"Inference": input, "Label": label}
if seq_length is not None:
this_input["SeqLength"] = seq_length
self._helper.append_op(
type='chunk_eval',
inputs=this_input,
outputs={
"Precision": [precision],
"Recall": [recall],
"F1-Score": [f1_score],
"NumInferChunks": [num_infer_chunks],
"NumLabelChunks": [num_label_chunks],
"NumCorrectChunks": [num_correct_chunks]
},
attrs={
"num_chunk_types": self.num_chunk_types,
"chunk_scheme": self.chunk_scheme,
"excluded_chunk_types": self.excluded_chunk_types or []
})
return (num_infer_chunks, num_label_chunks, num_correct_chunks)
class LacLoss(Loss):
def __init__(self):
super(LacLoss, self).__init__()
pass
def forward(self, outputs, labels):
avg_cost = outputs[1]
return avg_cost
class ChunkEval(Metric):
def __init__(self, num_labels, name=None, *args, **kwargs):
super(ChunkEval, self).__init__(*args, **kwargs)
self._init_name(name)
self.chunk_eval = Chunk_eval(
int(math.ceil((num_labels - 1) / 2.0)), "IOB")
self.reset()
def add_metric_op(self, *args):
crf_decode = args[0]
lengths = args[2]
label = args[3]
(num_infer_chunks, num_label_chunks,
num_correct_chunks) = self.chunk_eval(
input=crf_decode, label=label, seq_length=lengths)
return [num_infer_chunks, num_label_chunks, num_correct_chunks]
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
*args, **kwargs):
self.infer_chunks_total += num_infer_chunks
self.label_chunks_total += num_label_chunks
self.correct_chunks_total += num_correct_chunks
precision = float(
num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
recall = float(
num_correct_chunks) / num_label_chunks if num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if num_correct_chunks else 0
return [precision, recall, f1_score]
def reset(self):
self.infer_chunks_total = 0
self.label_chunks_total = 0
self.correct_chunks_total = 0
def accumulate(self):
precision = float(
self.correct_chunks_total
) / self.infer_chunks_total if self.infer_chunks_total else 0
recall = float(
self.correct_chunks_total
) / self.label_chunks_total if self.label_chunks_total else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.correct_chunks_total else 0
res = [precision, recall, f1_score]
return res
def _init_name(self, name):
name = name or 'chunk eval'
self._name = ['precision', 'recall', 'F1']
def name(self):
return self._name
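For clarity, the precision/recall/F1 arithmetic that `ChunkEval.update()` and `accumulate()` perform over the chunk counts, as a standalone worked example with made-up counts:

```python
# Made-up chunk counts accumulated over an evaluation pass.
infer_chunks_total, label_chunks_total, correct_chunks_total = 10, 12, 8

precision = correct_chunks_total / infer_chunks_total if infer_chunks_total else 0.0
recall = correct_chunks_total / label_chunks_total if label_chunks_total else 0.0
f1 = 2 * precision * recall / (precision + recall) if correct_chunks_total else 0.0

print(precision, recall, f1)  # 0.8 0.666... ~0.7273
```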