Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
hapi
提交
50816a2d
H
hapi
项目概览
PaddlePaddle
/
hapi
通知
11
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
4
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
H
hapi
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
4
Issue
4
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
50816a2d
编写于
4月 17, 2020
作者:
0
0YuanZhang0
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update_sequence_tagging
上级
dc437431
变更
9
展开全部
隐藏空白更改
内联
并排
Showing
9 changed file
with
335 addition
and
676 deletion
+335
-676
hapi/text/__init__.py
hapi/text/__init__.py
+2
-1
hapi/text/text.py
hapi/text/text.py
+158
-604
sequence_tagging/README.md
sequence_tagging/README.md
+7
-4
sequence_tagging/eval.py
sequence_tagging/eval.py
+20
-21
sequence_tagging/predict.py
sequence_tagging/predict.py
+8
-4
sequence_tagging/reader.py
sequence_tagging/reader.py
+55
-18
sequence_tagging/train.py
sequence_tagging/train.py
+9
-7
sequence_tagging/train.sh
sequence_tagging/train.sh
+0
-17
sequence_tagging/utils/metrics.py
sequence_tagging/utils/metrics.py
+76
-0
未找到文件。
hapi/text/__init__.py
浏览文件 @
50816a2d
...
...
@@ -25,7 +25,8 @@ from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer
from
hapi.text.text
import
TransformerEncoder
as
TransformerEncoder
from
hapi.text.text
import
TransformerDecoder
as
TransformerDecoder
from
hapi.text.text
import
TransformerBeamSearchDecoder
as
TransformerBeamSearchDecoder
from
hapi.text.text
import
DynamicGRU
as
DynamicGRU
from
hapi.text.text
import
GRUCell
as
GRUCell
from
hapi.text.text
import
GRUEncoderCell
as
GRUEncoderCell
from
hapi.text.text
import
BiGRU
as
BiGRU
from
hapi.text.text
import
Linear_chain_crf
as
Linear_chain_crf
from
hapi.text.text
import
Crf_decoding
as
Crf_decoding
...
...
hapi/text/text.py
浏览文件 @
50816a2d
此差异已折叠。
点击以展开。
sequence_tagging/README.md
浏览文件 @
50816a2d
...
...
@@ -6,7 +6,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词
|模型|Precision|Recall|F1-score|
|:-:|:-:|:-:|:-:|
|Lexical Analysis|8
9.2%|89.4%|89.
3%|
|Lexical Analysis|8
8.26%|89.20%|88.7
3%|
## 2. 快速开始
...
...
@@ -139,7 +139,7 @@ python predict.py \
--init_from_checkpoint
model_baseline/params
\
--output_file
predict.result
\
--mode
predict
\
--device
g
pu
\
--device
c
pu
\
-d
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
...
...
@@ -157,7 +157,7 @@ python eval.py \
--label_dict_path
./conf/tag.dic
\
--word_rep_dict_path
./conf/q2b.dic
\
--init_from_checkpoint
./model_baseline/params
\
--device
g
pu
\
--device
c
pu
\
-d
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
...
...
@@ -189,7 +189,10 @@ python eval.py \
### 模型原理介绍
上面介绍的模型原理如下图所示:
<br
/>
![
GRU-CRF-MODEL
](
./images/gru-crf-model.png
)
<p
align=
"center"
>
<img
src=
"./images/gru-crf-model.png"
width =
"340"
height =
"300"
/>
<br
/>
Overall Architecture of GRU-CRF-MODEL
</p>
### 数据格式
训练使用的数据可以由用户根据实际的应用场景,自己组织数据。除了第一行是
`text_a\tlabel`
固定的开头,后面的每行数据都是由两列组成,以制表符分隔,第一列是 utf-8 编码的中文文本,以
`\002`
分割,第二列是对应每个字的标注,以
`\002`
分隔。我们采用 IOB2 标注体系,即以 X-B 作为类型为 X 的词的开始,以 X-I 作为类型为 X 的词的持续,以 O 表示不关注的字(实际上,在词性、专名联合标注中,不存在 O )。示例如下:
...
...
sequence_tagging/eval.py
浏览文件 @
50816a2d
...
...
@@ -25,8 +25,9 @@ import math
import
argparse
import
numpy
as
np
from
train
import
SeqTagging
,
Chunk_eval
from
train
import
SeqTagging
from
utils.check
import
check_gpu
,
check_version
from
utils.metrics
import
chunk_count
from
reader
import
LacDataset
,
create_lexnet_data_generator
,
create_dataloader
work_dir
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)))
...
...
@@ -42,14 +43,13 @@ def main(args):
place
=
set_device
(
args
.
device
)
fluid
.
enable_dygraph
(
place
)
if
args
.
dynamic
else
None
inputs
=
[
Input
([
None
,
args
.
max_seq_len
],
'int64'
,
name
=
'words'
),
Input
([
None
],
'int64'
,
name
=
'length'
)]
inputs
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'words'
),
Input
([
None
],
'int64'
,
name
=
'length'
)]
feed_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
]
dataset
=
LacDataset
(
args
)
eval_path
=
args
.
test_file
chunk_eval
=
Chunk_eval
(
int
(
math
.
ceil
((
dataset
.
num_labels
-
1
)
/
2.0
)),
"IOB"
)
chunk_evaluator
=
fluid
.
metrics
.
ChunkEvaluator
()
chunk_evaluator
.
reset
()
...
...
@@ -69,25 +69,23 @@ def main(args):
model
.
mode
=
"test"
model
.
prepare
(
inputs
=
inputs
)
model
.
load
(
args
.
init_from_checkpoint
)
f
=
open
(
args
.
output_file
,
"wb"
)
for
data
in
eval_dataset
():
words
,
lens
,
targets
,
targets
=
data
crf_decode
,
length
=
model
.
test
(
inputs
=
flatten
(
data
))
crf_decode
=
fluid
.
dygraph
.
to_variable
(
crf_decode
)
length
=
fluid
.
dygraph
.
to_variable
(
length
)
(
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
=
chunk_eval
(
input
=
crf_decode
,
label
=
targets
,
seq_length
=
length
)
print
(
num_infer_chunks
.
numpy
(),
num_label_chunks
.
numpy
(),
num_correct_chunks
.
numpy
())
chunk_evaluator
.
update
(
num_infer_chunks
.
numpy
(),
num_label_chunks
.
numpy
(),
num_correct_chunks
.
numpy
())
model
.
load
(
args
.
init_from_checkpoint
,
skip_mismatch
=
True
)
for
data
in
eval_dataset
():
if
len
(
data
)
==
1
:
batch_data
=
data
[
0
]
targets
=
np
.
array
(
batch_data
[
2
])
else
:
batch_data
=
data
targets
=
batch_data
[
2
].
numpy
()
inputs_data
=
[
batch_data
[
0
],
batch_data
[
1
]]
crf_decode
,
length
=
model
.
test
(
inputs
=
inputs_data
)
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
=
chunk_count
(
crf_decode
,
targets
,
length
,
dataset
.
id2label_dict
)
chunk_evaluator
.
update
(
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
precision
,
recall
,
f1
=
chunk_evaluator
.
eval
()
print
(
"[test] P: %.5f, R: %.5f, F1: %.5f"
%
(
precision
,
recall
,
f1
))
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
(
"sequence tagging training"
)
...
...
@@ -176,7 +174,8 @@ if __name__ == '__main__':
args
=
parser
.
parse_args
()
print
(
args
)
check_gpu
(
args
.
device
)
use_gpu
=
True
if
args
.
device
==
"gpu"
else
False
check_gpu
(
use_gpu
)
check_version
()
main
(
args
)
sequence_tagging/predict.py
浏览文件 @
50816a2d
...
...
@@ -42,7 +42,7 @@ def main(args):
place
=
set_device
(
args
.
device
)
fluid
.
enable_dygraph
(
place
)
if
args
.
dynamic
else
None
inputs
=
[
Input
([
None
,
args
.
max_seq_len
],
'int64'
,
name
=
'words'
),
inputs
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'words'
),
Input
([
None
],
'int64'
,
name
=
'length'
)]
feed_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
]
...
...
@@ -70,8 +70,11 @@ def main(args):
f
=
open
(
args
.
output_file
,
"wb"
)
for
data
in
predict_dataset
():
results
,
length
=
model
.
test
(
inputs
=
flatten
(
data
))
#length_list = np.fromstring(length, dtype=str)
if
len
(
data
)
==
1
:
input_data
=
data
[
0
]
else
:
input_data
=
data
results
,
length
=
model
.
test
(
inputs
=
flatten
(
input_data
))
for
i
in
range
(
len
(
results
)):
word_len
=
length
[
i
]
word_ids
=
results
[
i
][:
word_len
]
...
...
@@ -162,7 +165,8 @@ if __name__ == '__main__':
args
=
parser
.
parse_args
()
print
(
args
)
check_gpu
(
args
.
device
)
use_gpu
=
True
if
args
.
device
==
"gpu"
else
False
check_gpu
(
use_gpu
)
check_version
()
main
(
args
)
sequence_tagging/reader.py
浏览文件 @
50816a2d
...
...
@@ -21,7 +21,7 @@ from __future__ import print_function
import
io
import
numpy
as
np
import
paddle
.fluid
as
fluid
import
paddle
class
LacDataset
(
object
):
...
...
@@ -120,7 +120,7 @@ class LacDataset(object):
def
wrapper
():
fread
=
io
.
open
(
filename
,
"r"
,
encoding
=
"utf-8"
)
if
mode
==
"train"
or
mode
==
"test"
:
if
mode
==
"train"
:
headline
=
next
(
fread
)
headline
=
headline
.
strip
().
split
(
'
\t
'
)
assert
len
(
headline
)
==
2
and
headline
[
0
]
==
"text_a"
and
headline
[
...
...
@@ -133,6 +133,8 @@ class LacDataset(object):
word_ids
=
self
.
word_to_ids
(
words
.
split
(
"
\002
"
))
label_ids
=
self
.
label_to_ids
(
labels
.
split
(
"
\002
"
))
assert
len
(
word_ids
)
==
len
(
label_ids
)
words_len
=
np
.
int64
(
len
(
word_ids
))
word_ids
=
word_ids
[
0
:
max_seq_len
]
words_len
=
np
.
int64
(
len
(
word_ids
))
word_ids
+=
[
0
for
_
in
range
(
max_seq_len
-
words_len
)]
...
...
@@ -140,6 +142,21 @@ class LacDataset(object):
label_ids
+=
[
0
for
_
in
range
(
max_seq_len
-
words_len
)]
assert
len
(
word_ids
)
==
len
(
label_ids
)
yield
word_ids
,
label_ids
,
words_len
elif
mode
==
"test"
:
headline
=
next
(
fread
)
headline
=
headline
.
strip
().
split
(
'
\t
'
)
assert
len
(
headline
)
==
2
and
headline
[
0
]
==
"text_a"
and
headline
[
1
]
==
"label"
buf
=
[]
for
line
in
fread
:
words
,
labels
=
line
.
strip
(
"
\n
"
).
split
(
"
\t
"
)
if
len
(
words
)
<
1
:
continue
word_ids
=
self
.
word_to_ids
(
words
.
split
(
"
\002
"
))
label_ids
=
self
.
label_to_ids
(
labels
.
split
(
"
\002
"
))
assert
len
(
word_ids
)
==
len
(
label_ids
)
words_len
=
np
.
int64
(
len
(
word_ids
))
yield
word_ids
,
label_ids
,
words_len
else
:
for
line
in
fread
:
words
=
line
.
strip
(
"
\n
"
).
split
(
'
\t
'
)[
0
]
...
...
@@ -157,9 +174,16 @@ class LacDataset(object):
return
wrapper
def
create_lexnet_data_generator
(
args
,
reader
,
file_name
,
place
,
mode
=
"train"
):
def
create_lexnet_data_generator
(
args
,
reader
,
file_name
,
place
,
mode
=
"train"
):
def
padding_data
(
max_len
,
batch_data
):
padding_batch_data
=
[]
for
data
in
batch_data
:
data
+=
[
0
for
_
in
range
(
max_len
-
len
(
data
))]
padding_batch_data
.
append
(
data
)
return
padding_batch_data
def
wrapper
():
if
mode
==
"train"
or
mode
==
"test"
:
if
mode
==
"train"
:
batch_words
,
batch_labels
,
seq_lens
=
[],
[],
[]
for
epoch
in
xrange
(
args
.
epoch
):
for
instance
in
reader
.
file_reader
(
...
...
@@ -169,12 +193,32 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
batch_words
.
append
(
words
)
batch_labels
.
append
(
labels
)
seq_lens
.
append
(
words_len
)
if
len
(
seq_lens
)
==
args
.
batch_size
:
if
len
(
seq_lens
)
==
args
.
batch_size
:
yield
batch_words
,
seq_lens
,
batch_labels
,
batch_labels
batch_words
,
batch_labels
,
seq_lens
=
[],
[],
[]
if
len
(
seq_lens
)
>
0
:
yield
batch_words
,
seq_lens
,
batch_labels
,
batch_labels
elif
mode
==
"test"
:
batch_words
,
batch_labels
,
seq_lens
,
max_len
=
[],
[],
[],
0
for
instance
in
reader
.
file_reader
(
file_name
,
mode
,
max_seq_len
=
args
.
max_seq_len
)():
words
,
labels
,
words_len
=
instance
max_len
=
words_len
if
words_len
>
max_len
else
max_len
if
len
(
seq_lens
)
<
args
.
batch_size
:
batch_words
.
append
(
words
)
seq_lens
.
append
(
words_len
)
batch_labels
.
append
(
labels
)
if
len
(
seq_lens
)
==
args
.
batch_size
:
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
padding_batch_labels
=
padding_data
(
max_len
,
batch_labels
)
yield
padding_batch_words
,
seq_lens
,
padding_batch_labels
,
padding_batch_labels
batch_words
,
batch_labels
,
seq_lens
,
max_len
=
[],
[],
[],
0
if
len
(
seq_lens
)
>
0
:
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
padding_batch_labels
=
padding_data
(
max_len
,
batch_labels
)
yield
padding_batch_words
,
seq_lens
,
padding_batch_labels
,
padding_batch_labels
else
:
batch_words
,
seq_lens
,
max_len
=
[],
[],
0
for
instance
in
reader
.
file_reader
(
...
...
@@ -183,20 +227,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
if
len
(
seq_lens
)
<
args
.
batch_size
:
batch_words
.
append
(
words
)
seq_lens
.
append
(
words_len
)
if
words_len
>
max_len
:
max_len
=
words_len
if
len
(
seq_lens
)
==
args
.
batch_size
:
padding_batch_words
=
[]
for
words
in
batch_words
:
words
+=
[
0
for
_
in
range
(
max_len
-
len
(
words
))]
padding_batch_words
.
append
(
words
)
max_len
=
words_len
if
words_len
>
max_len
else
max_len
if
len
(
seq_lens
)
==
args
.
batch_size
:
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
yield
padding_batch_words
,
seq_lens
batch_words
,
seq_lens
,
max_len
=
[],
[],
0
if
len
(
seq_lens
)
>
0
:
padding_batch_words
=
[]
for
words
in
batch_words
:
words
+=
[
0
for
_
in
range
(
max_len
-
len
(
words
))]
padding_batch_words
.
append
(
words
)
padding_batch_words
=
padding_data
(
max_len
,
batch_words
)
yield
padding_batch_words
,
seq_lens
return
wrapper
...
...
@@ -204,13 +241,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def
create_dataloader
(
generator
,
place
,
feed_list
=
None
):
if
not
feed_list
:
data_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
data_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
capacity
=
50
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
)
else
:
data_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
data_loader
=
paddle
.
io
.
DataLoader
.
from_generator
(
feed_list
=
feed_list
,
capacity
=
50
,
use_double_buffer
=
True
,
...
...
sequence_tagging/train.py
浏览文件 @
50816a2d
...
...
@@ -154,9 +154,10 @@ class ChunkEval(Metric):
int
(
math
.
ceil
((
num_labels
-
1
)
/
2.0
)),
"IOB"
)
self
.
reset
()
def
add_metric_op
(
self
,
pred
,
label
,
*
args
,
**
kwargs
):
crf_decode
=
pred
[
0
]
lengths
=
pred
[
2
]
def
add_metric_op
(
self
,
*
args
):
crf_decode
=
args
[
0
]
lengths
=
args
[
2
]
label
=
args
[
3
]
(
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
=
self
.
chunk_eval
(
input
=
crf_decode
,
label
=
label
,
seq_length
=
lengths
)
...
...
@@ -204,11 +205,11 @@ def main(args):
place
=
set_device
(
args
.
device
)
fluid
.
enable_dygraph
(
place
)
if
args
.
dynamic
else
None
inputs
=
[
Input
([
None
,
args
.
max_seq_len
],
'int64'
,
name
=
'words'
),
inputs
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'words'
),
Input
([
None
],
'int64'
,
name
=
'length'
),
Input
([
None
,
args
.
max_seq_len
],
'int64'
,
name
=
'target'
)]
Input
([
None
,
None
],
'int64'
,
name
=
'target'
)]
labels
=
[
Input
([
None
,
args
.
max_seq_len
],
'int64'
,
name
=
'labels'
)]
labels
=
[
Input
([
None
,
None
],
'int64'
,
name
=
'labels'
)]
feed_list
=
None
if
args
.
dynamic
else
[
x
.
forward
()
for
x
in
inputs
+
labels
]
dataset
=
LacDataset
(
args
)
...
...
@@ -343,7 +344,8 @@ if __name__ == '__main__':
args
=
parser
.
parse_args
()
print
(
args
)
check_gpu
(
args
.
device
)
use_gpu
=
True
if
args
.
device
==
"gpu"
else
False
check_gpu
(
use_gpu
)
check_version
()
main
(
args
)
sequence_tagging/train.sh
已删除
100644 → 0
浏览文件 @
dc437431
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3
python
-m
paddle.distributed.launch
--selected_gpus
=
0,1,2,3 train.py
\
--train_file
./data/train.tsv
\
--test_file
./data/test.tsv
\
--word_dict_path
./data/word.dic
\
--label_dict_path
./data/tag.dic
\
--word_rep_dict_path
./data/q2b.dic
\
--device
gpu
\
--grnn_hidden_dim
128
\
--word_emb_dim
128
\
--bigru_num
2
\
--base_learning_rate
1e-3
\
--batch_size
300
\
--epoch
10
\
--save_dir
./model
\
-d
sequence_tagging/utils/metrics.py
0 → 100644
浏览文件 @
50816a2d
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
sys
import
paddle.fluid
as
fluid
# Names exported via ``from ... import *``: the chunk-metric helpers below.
__all__ = ['chunk_count', "build_chunk"]
def build_chunk(data_list, id2label_dict):
    """
    Assembly entity

    Translate each id in *data_list* to its tag name through
    *id2label_dict* (whose keys are the *string* form of the ids) and
    collect contiguous spans into a dict keyed by ``"start_end"``
    (inclusive indices) with the span's type prefix as the value.

    NOTE(review): two quirks are preserved on purpose so that counts
    stay comparable between predictions and labels:
      * "O" runs are recorded as spans too;
      * the final span is never flushed when the sequence ends.
    """
    spans = {}
    cur_type = ""
    cur_start = 0

    def _flush(end_idx):
        # Close the span that ended just before ``end_idx``.
        spans["%d_%d" % (cur_start, end_idx - 1)] = cur_type

    for idx, tag_id in enumerate(data_list):
        tag = id2label_dict.get(str(tag_id))
        if tag == u"O":
            if idx != 0:
                _flush(idx)
            cur_start, cur_type = idx, tag
        elif tag.endswith(u"B"):
            # "X-B" always starts a new span of type X.
            if idx != 0:
                _flush(idx)
            cur_start, cur_type = idx, tag.split('-')[0]
        elif tag.endswith(u"I") and tag.split('-')[0] != cur_type:
            # "X-I" with a mismatched type also starts a new span
            # (malformed IOB2 sequence); a matching "X-I" just extends.
            if idx != 0:
                _flush(idx)
            cur_start, cur_type = idx, tag.split('-')[0]
    return spans
def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict):
    """
    calculate num_correct_chunks num_error_chunks total_num for metrics

    For every row of the batch, truncate prediction and label id
    sequences to their valid length, build their span dicts with
    ``build_chunk`` and accumulate:
      * number of predicted spans,
      * number of gold spans,
      * number of spans matching in both position and type.

    Returns the three totals as a tuple.
    """
    total_infer = 0
    total_label = 0
    total_correct = 0
    assert infer_numpy.shape[0] == label_numpy.shape[0]
    for row in range(infer_numpy.shape[0]):
        valid_len = seq_len[row]
        pred_spans = build_chunk(infer_numpy[row][:valid_len], id2label_dict)
        gold_spans = build_chunk(label_numpy[row][:valid_len], id2label_dict)
        total_infer += len(pred_spans)
        total_label += len(gold_spans)
        # A span is correct when the same "start_end" key carries the
        # same type in both dicts (span values are never None, so
        # ``get`` is a safe membership-plus-equality test).
        total_correct += sum(
            1 for key, kind in pred_spans.items()
            if gold_spans.get(key) == kind)
    return total_infer, total_label, total_correct
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录