提交 50816a2d 编写于 作者: 0 0YuanZhang0

update_sequence_tagging

上级 dc437431
......@@ -25,7 +25,8 @@ from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer
from hapi.text.text import TransformerEncoder as TransformerEncoder
from hapi.text.text import TransformerDecoder as TransformerDecoder
from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder
from hapi.text.text import DynamicGRU as DynamicGRU
from hapi.text.text import GRUCell as GRUCell
from hapi.text.text import GRUEncoderCell as GRUEncoderCell
from hapi.text.text import BiGRU as BiGRU
from hapi.text.text import Linear_chain_crf as Linear_chain_crf
from hapi.text.text import Crf_decoding as Crf_decoding
......
此差异已折叠。
......@@ -6,7 +6,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词
|模型|Precision|Recall|F1-score|
|:-:|:-:|:-:|:-:|
|Lexical Analysis|89.2%|89.4%|89.3%|
|Lexical Analysis|88.26%|89.20%|88.73%|
## 2. 快速开始
......@@ -139,7 +139,7 @@ python predict.py \
--init_from_checkpoint model_baseline/params \
--output_file predict.result \
--mode predict \
--device gpu \
--device cpu \
-d
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
......@@ -157,7 +157,7 @@ python eval.py \
--label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \
--init_from_checkpoint ./model_baseline/params \
--device gpu \
--device cpu \
-d
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
......@@ -189,7 +189,10 @@ python eval.py \
### 模型原理介绍
上面介绍的模型原理如下图所示:<br />
![GRU-CRF-MODEL](./images/gru-crf-model.png)
<p align="center">
<img src="./images/gru-crf-model.png" width = "340" height = "300" /> <br />
Overall Architecture of GRU-CRF-MODEL
</p>
### 数据格式
训练使用的数据可以由用户根据实际的应用场景,自己组织数据。除了第一行是 `text_a\tlabel` 固定的开头,后面的每行数据都是由两列组成,以制表符分隔,第一列是 utf-8 编码的中文文本,以 `\002` 分割,第二列是对应每个字的标注,以 `\002` 分隔。我们采用 IOB2 标注体系,即以 X-B 作为类型为 X 的词的开始,以 X-I 作为类型为 X 的词的持续,以 O 表示不关注的字(实际上,在词性、专名联合标注中,不存在 O )。示例如下:
......
......@@ -25,8 +25,9 @@ import math
import argparse
import numpy as np
from train import SeqTagging, Chunk_eval
from train import SeqTagging
from utils.check import check_gpu, check_version
from utils.metrics import chunk_count
from reader import LacDataset, create_lexnet_data_generator, create_dataloader
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
......@@ -42,14 +43,13 @@ def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, args.max_seq_len], 'int64', name='words'),
Input([None], 'int64', name='length')]
inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')]
feed_list = None if args.dynamic else [x.forward() for x in inputs]
dataset = LacDataset(args)
eval_path = args.test_file
chunk_eval = Chunk_eval(int(math.ceil((dataset.num_labels - 1) / 2.0)), "IOB")
chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset()
......@@ -69,25 +69,23 @@ def main(args):
model.mode = "test"
model.prepare(inputs=inputs)
model.load(args.init_from_checkpoint)
f = open(args.output_file, "wb")
for data in eval_dataset():
words, lens, targets, targets = data
crf_decode, length = model.test(inputs=flatten(data))
crf_decode = fluid.dygraph.to_variable(crf_decode)
length = fluid.dygraph.to_variable(length)
(num_infer_chunks, num_label_chunks, num_correct_chunks) = chunk_eval(
input=crf_decode,
label=targets,
seq_length=length)
print(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
chunk_evaluator.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
model.load(args.init_from_checkpoint, skip_mismatch=True)
for data in eval_dataset():
if len(data) == 1:
batch_data = data[0]
targets = np.array(batch_data[2])
else:
batch_data = data
targets = batch_data[2].numpy()
inputs_data = [batch_data[0], batch_data[1]]
crf_decode, length = model.test(inputs=inputs_data)
num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_count(crf_decode, targets, length, dataset.id2label_dict)
chunk_evaluator.update(num_infer_chunks, num_label_chunks, num_correct_chunks)
precision, recall, f1 = chunk_evaluator.eval()
print("[test] P: %.5f, R: %.5f, F1: %.5f" % (precision, recall, f1))
if __name__ == '__main__':
parser = argparse.ArgumentParser("sequence tagging training")
......@@ -176,7 +174,8 @@ if __name__ == '__main__':
args = parser.parse_args()
print(args)
check_gpu(args.device)
use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version()
main(args)
......@@ -42,7 +42,7 @@ def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, args.max_seq_len], 'int64', name='words'),
inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')]
feed_list = None if args.dynamic else [x.forward() for x in inputs]
......@@ -70,8 +70,11 @@ def main(args):
f = open(args.output_file, "wb")
for data in predict_dataset():
results, length = model.test(inputs=flatten(data))
#length_list = np.fromstring(length, dtype=str)
if len(data) == 1:
input_data = data[0]
else:
input_data = data
results, length = model.test(inputs=flatten(input_data))
for i in range(len(results)):
word_len = length[i]
word_ids = results[i][: word_len]
......@@ -162,7 +165,8 @@ if __name__ == '__main__':
args = parser.parse_args()
print(args)
check_gpu(args.device)
use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version()
main(args)
......@@ -21,7 +21,7 @@ from __future__ import print_function
import io
import numpy as np
import paddle.fluid as fluid
import paddle
class LacDataset(object):
......@@ -120,7 +120,7 @@ class LacDataset(object):
def wrapper():
fread = io.open(filename, "r", encoding="utf-8")
if mode == "train" or mode == "test":
if mode == "train":
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[
......@@ -133,6 +133,8 @@ class LacDataset(object):
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
words_len = np.int64(len(word_ids))
word_ids = word_ids[0:max_seq_len]
words_len = np.int64(len(word_ids))
word_ids += [0 for _ in range(max_seq_len - words_len)]
......@@ -140,6 +142,21 @@ class LacDataset(object):
label_ids += [0 for _ in range(max_seq_len - words_len)]
assert len(word_ids) == len(label_ids)
yield word_ids, label_ids, words_len
elif mode == "test":
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[
1] == "label"
buf = []
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
words_len = np.int64(len(word_ids))
yield word_ids, label_ids, words_len
else:
for line in fread:
words = line.strip("\n").split('\t')[0]
......@@ -157,9 +174,16 @@ class LacDataset(object):
return wrapper
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def padding_data(max_len, batch_data):
padding_batch_data = []
for data in batch_data:
data += [0 for _ in range(max_len - len(data))]
padding_batch_data.append(data)
return padding_batch_data
def wrapper():
if mode == "train" or mode == "test":
if mode == "train":
batch_words, batch_labels, seq_lens = [], [], []
for epoch in xrange(args.epoch):
for instance in reader.file_reader(
......@@ -169,12 +193,32 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
batch_words.append(words)
batch_labels.append(labels)
seq_lens.append(words_len)
if len(seq_lens) == args.batch_size:
if len(seq_lens) == args.batch_size:
yield batch_words, seq_lens, batch_labels, batch_labels
batch_words, batch_labels, seq_lens = [], [], []
if len(seq_lens) > 0:
yield batch_words, seq_lens, batch_labels, batch_labels
elif mode == "test":
batch_words, batch_labels, seq_lens, max_len = [], [], [], 0
for instance in reader.file_reader(
file_name, mode, max_seq_len=args.max_seq_len)():
words, labels, words_len = instance
max_len = words_len if words_len > max_len else max_len
if len(seq_lens) < args.batch_size:
batch_words.append(words)
seq_lens.append(words_len)
batch_labels.append(labels)
if len(seq_lens) == args.batch_size:
padding_batch_words = padding_data(max_len, batch_words)
padding_batch_labels = padding_data(max_len, batch_labels)
yield padding_batch_words, seq_lens, padding_batch_labels, padding_batch_labels
batch_words, batch_labels, seq_lens, max_len = [], [], [], 0
if len(seq_lens) > 0:
padding_batch_words = padding_data(max_len, batch_words)
padding_batch_labels = padding_data(max_len, batch_labels)
yield padding_batch_words, seq_lens, padding_batch_labels, padding_batch_labels
else:
batch_words, seq_lens, max_len = [], [], 0
for instance in reader.file_reader(
......@@ -183,20 +227,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
if len(seq_lens) < args.batch_size:
batch_words.append(words)
seq_lens.append(words_len)
if words_len > max_len:
max_len = words_len
if len(seq_lens) == args.batch_size:
padding_batch_words = []
for words in batch_words:
words += [0 for _ in range(max_len - len(words))]
padding_batch_words.append(words)
max_len = words_len if words_len > max_len else max_len
if len(seq_lens) == args.batch_size:
padding_batch_words = padding_data(max_len, batch_words)
yield padding_batch_words, seq_lens
batch_words, seq_lens, max_len = [], [], 0
if len(seq_lens) > 0:
padding_batch_words = []
for words in batch_words:
words += [0 for _ in range(max_len - len(words))]
padding_batch_words.append(words)
padding_batch_words = padding_data(max_len, batch_words)
yield padding_batch_words, seq_lens
return wrapper
......@@ -204,13 +241,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def create_dataloader(generator, place, feed_list=None):
if not feed_list:
data_loader = fluid.io.DataLoader.from_generator(
data_loader = paddle.io.DataLoader.from_generator(
capacity=50,
use_double_buffer=True,
iterable=True,
return_list=True)
else:
data_loader = fluid.io.DataLoader.from_generator(
data_loader = paddle.io.DataLoader.from_generator(
feed_list=feed_list,
capacity=50,
use_double_buffer=True,
......
......@@ -154,9 +154,10 @@ class ChunkEval(Metric):
int(math.ceil((num_labels - 1) / 2.0)), "IOB")
self.reset()
def add_metric_op(self, pred, label, *args, **kwargs):
crf_decode = pred[0]
lengths = pred[2]
def add_metric_op(self, *args):
crf_decode = args[0]
lengths = args[2]
label = args[3]
(num_infer_chunks, num_label_chunks,
num_correct_chunks) = self.chunk_eval(
input=crf_decode, label=label, seq_length=lengths)
......@@ -204,11 +205,11 @@ def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, args.max_seq_len], 'int64', name='words'),
inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length'),
Input([None, args.max_seq_len], 'int64', name='target')]
Input([None, None], 'int64', name='target')]
labels = [Input([None, args.max_seq_len], 'int64', name='labels')]
labels = [Input([None, None], 'int64', name='labels')]
feed_list = None if args.dynamic else [x.forward() for x in inputs + labels]
dataset = LacDataset(args)
......@@ -343,7 +344,8 @@ if __name__ == '__main__':
args = parser.parse_args()
print(args)
check_gpu(args.device)
use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version()
main(args)
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 train.py \
--train_file ./data/train.tsv \
--test_file ./data/test.tsv \
--word_dict_path ./data/word.dic \
--label_dict_path ./data/tag.dic \
--word_rep_dict_path ./data/q2b.dic \
--device gpu \
--grnn_hidden_dim 128 \
--word_emb_dim 128 \
--bigru_num 2 \
--base_learning_rate 1e-3 \
--batch_size 300 \
--epoch 10 \
--save_dir ./model \
-d
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import paddle.fluid as fluid
__all__ = ['chunk_count', "build_chunk"]
def build_chunk(data_list, id2label_dict):
"""
Assembly entity
"""
tag_list = [id2label_dict.get(str(id)) for id in data_list]
ner_dict = {}
ner_str = ""
ner_start = 0
for i in range(len(tag_list)):
tag = tag_list[i]
if tag == u"O":
if i != 0:
key = "%d_%d" % (ner_start, i - 1)
ner_dict[key] = ner_str
ner_start = i
ner_str = tag
elif tag.endswith(u"B"):
if i != 0:
key = "%d_%d" % (ner_start, i - 1)
ner_dict[key] = ner_str
ner_start = i
ner_str = tag.split('-')[0]
elif tag.endswith(u"I"):
if tag.split('-')[0] != ner_str:
if i != 0:
key = "%d_%d" % (ner_start, i - 1)
ner_dict[key] = ner_str
ner_start = i
ner_str = tag.split('-')[0]
return ner_dict
def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict):
"""
calculate num_correct_chunks num_error_chunks total_num for metrics
"""
num_infer_chunks, num_label_chunks, num_correct_chunks = 0, 0, 0
assert infer_numpy.shape[0] == label_numpy.shape[0]
for i in range(infer_numpy.shape[0]):
infer_list = infer_numpy[i][: seq_len[i]]
label_list = label_numpy[i][: seq_len[i]]
infer_dict = build_chunk(infer_list, id2label_dict)
num_infer_chunks += len(infer_dict)
label_dict = build_chunk(label_list, id2label_dict)
num_label_chunks += len(label_dict)
for key in infer_dict:
if key in label_dict and label_dict[key] == infer_dict[key]:
num_correct_chunks += 1
return num_infer_chunks, num_label_chunks, num_correct_chunks
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册