Commit 9e2c2b34 authored by Zeyu Chen

remove useless code of ernie finetuning

Parent e0de0d86
......@@ -36,7 +36,6 @@ parser.add_argument("--data_dir", type=str, default=None, help="Path to training
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
args = parser.parse_args()
# yapf: enable.
......@@ -55,9 +54,7 @@ if __name__ == '__main__':
# Load the PaddleHub BERT module
module = hub.Module(module_dir=args.hub_module_dir)
# Tokenize the dataset according to the model's vocabulary
reader = hub.reader.BERTTokenizeReader(
reader = hub.reader.ClassifyReader(
dataset=hub.dataset.ChnSentiCorp(), # download chnsenticorp dataset
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
......
export CUDA_VISIBLE_DEVICES=5
export CUDA_VISIBLE_DEVICES=3
DATA_PATH=./chnsenticorp_data
HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
#HUB_MODULE_DIR="./hub_module/ernie_stable.hub_module"
CKPT_DIR="./ckpt"
#rm -rf $CKPT_DIR
python -u finetune_with_hub.py \
--batch_size 32 \
--hub_module_dir=$HUB_MODULE_DIR \
--data_dir ${DATA_PATH} \
--weight_decay 0.01 \
--checkpoint_dir $CKPT_DIR \
--num_epoch 3 \
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
strategy = hub.BERTFinetuneStrategy(
weight_decay=args.weight_decay,
learning_rate=args.learning_rate,
warmup_strategy="linear_warmup_decay",
)
config = hub.RunConfig(
eval_interval=100,
use_cuda=True,
num_epoch=args.num_epoch,
batch_size=args.batch_size,
strategy=strategy)
# Load the PaddleHub ERNIE module
module = hub.Module(name="ernie")
reader = hub.reader.SequenceLabelReader(
dataset=hub.dataset.MSRA_NER(),
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
num_labels = len(reader.get_labels())
input_dict, output_dict, program = module.context(
sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
with fluid.program_guard(program):
label = fluid.layers.data(
name="label", shape=[args.max_seq_len, 1], dtype='int64')
seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64')
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
sequence_output = output_dict["sequence_output"]
# Set up the feed list for the data feeder;
# every tensor the module needs must be fed.
feed_list = [
input_dict["input_ids"].name, input_dict["position_ids"].name,
input_dict["segment_ids"].name, input_dict["input_mask"].name,
label.name, seq_len.name
]
# Define a sequence labeling fine-tune task with PaddleHub's API
seq_label_task = hub.append_sequence_labeler(
feature=sequence_output,
labels=label,
seq_len=seq_len,
num_classes=num_labels)
# Fine-tune and evaluate with PaddleHub's API:
# training, evaluation, testing, and model saving are handled automatically.
hub.finetune_and_eval(
task=seq_label_task,
data_reader=reader,
feed_list=feed_list,
config=config)
export CUDA_VISIBLE_DEVICES=3
CKPT_DIR="./ckpt"
python -u finetune_with_hub.py \
--batch_size 16 \
--weight_decay 0.01 \
--checkpoint_dir $CKPT_DIR \
--num_epoch 3 \
--max_seq_len 256 \
--learning_rate 5e-5
......@@ -33,6 +33,7 @@ from .module.manager import default_module_manager
from .io.type import DataType
from .finetune.network import append_mlp_classifier
from .finetune.network import append_sequence_labeler
from .finetune.finetune import finetune_and_eval
from .finetune.config import RunConfig
from .finetune.task import Task
......
......@@ -30,6 +30,8 @@ class MSRA_NER(object):
self._load_label_map()
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_label_map(self):
self.label_map_file = os.path.join(self.dataset_dir, "label_map.json")
......@@ -40,12 +42,28 @@ class MSRA_NER(object):
train_file = os.path.join(self.dataset_dir, "train.tsv")
self.train_examples = self._read_tsv(train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
self.dev_examples = self._read_tsv(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.tsv")
self.test_examples = self._read_tsv(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
"""See base class."""
return ["0", "1"]
return ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
def get_label_map(self):
return self.label_map
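The order of this label list matters downstream. Assuming tag ids follow list order (an assumption, not stated in the commit), "O" lands at index 6 = tag_num - 1, matching the null_index convention in chunk_eval:
labels = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
label_ids = {tag: idx for idx, tag in enumerate(labels)}
# {'B-PER': 0, 'I-PER': 1, 'B-ORG': 2, 'I-ORG': 3, 'B-LOC': 4, 'I-LOC': 5, 'O': 6}
# tag_id // 2 -> entity type (0=PER, 1=ORG, 2=LOC); tag_id % 2 -> 0 for "B-", 1 for "I-"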
def _read_tsv(self, input_file, quotechar=None):
"""Reads a tab separated value file."""
......
......@@ -22,6 +22,7 @@ import multiprocessing
import paddle
import paddle.fluid as fluid
import numpy as np
from paddlehub.common.logger import logger
from paddlehub.finetune.strategy import BERTFinetuneStrategy, DefaultStrategy
......@@ -61,7 +62,149 @@ def _do_memory_optimization(task, config):
(lower_mem, upper_mem, unit)),
def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
def _finetune_seq_label_task(task,
data_reader,
feed_list,
config=None,
do_eval=False):
"""
Fine-tune a sequence labeling task; the evaluation metrics are F1, precision, and recall.
"""
main_program = task.main_program()
startup_program = task.startup_program()
loss = task.variable("loss")
seq_len = task.variable("seq_len")
num_epoch = config.num_epoch
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
exe = fluid.Executor(place=place)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
# Select strategy
if isinstance(config.strategy, hub.BERTFinetuneStrategy):
scheduled_lr = config.strategy.execute(loss, main_program,
data_reader, config)
elif isinstance(config.strategy, hub.DefaultStrategy):
config.strategy.execute(loss)
# TODO: add more finetune strategies
_do_memory_optimization(task, config)
# Try to restore model training checkpoint
current_epoch, global_step = load_checkpoint(config.checkpoint_dir, exe)
train_time_used = 0
logger.info("PaddleHub finetune start")
# Finetune loop
for epoch in range(current_epoch, num_epoch + 1):
train_reader = data_reader.data_generator(
batch_size=batch_size, phase='train')
num_trained_examples = loss_sum = 0
for batch in train_reader():
num_batch_examples = len(batch)
train_time_begin = time.time()
loss_v = exe.run(
feed=data_feeder.feed(batch), fetch_list=[loss.name])
train_time_used += time.time() - train_time_begin
global_step += 1
num_trained_examples += num_batch_examples
loss_sum += loss_v[0] * num_batch_examples
# log finetune status
if global_step % config.log_interval == 0:
avg_loss = loss_sum / num_trained_examples
speed = config.log_interval / train_time_used
logger.info("step %d: loss=%.5f [step/sec: %.2f]" %
(global_step, avg_loss, speed))
train_time_used = 0
num_trained_examples = loss_sum = 0
if config.save_ckpt_interval and global_step % config.save_ckpt_interval == 0:
# NOTE: the current checkpoint mechanism is incomplete;
# it can't restore the exact dataset training status
save_checkpoint(
checkpoint_dir=config.checkpoint_dir,
current_epoch=epoch,
global_step=global_step,
exe=exe)
if do_eval and global_step % config.eval_interval == 0:
evaluate_seq_label(
task,
data_reader,
feed_list,
phase="dev",
config=config)
evaluate_seq_label(
task,
data_reader,
feed_list,
phase="test",
config=config)
# NOTE: the current checkpoint mechanism is incomplete; it can't
# restore the dataset training status
save_checkpoint(
checkpoint_dir=config.checkpoint_dir,
current_epoch=num_epoch + 1,
global_step=global_step,
exe=exe)
if do_eval:
evaluate_seq_label(
task, data_reader, feed_list, phase="test", config=config)
logger.info("PaddleHub finetune finished.")
def evaluate_seq_label(task, data_reader, feed_list, phase="test", config=None):
fetch_list = [
task.variable("labels").name,
task.variable("infers").name,
task.variable("seq_len").name,
task.variable("loss").name
]
logger.info("Evaluation on {} dataset start".format(phase))
inference_program = task.inference_program()
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
exe = fluid.Executor(place=place)
with fluid.program_guard(inference_program):
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
num_eval_examples = acc_sum = loss_sum = 0
test_reader = data_reader.data_generator(
batch_size=batch_size, phase=phase)
eval_time_begin = time.time()
eval_step = 0
total_label, total_infer, total_correct = 0.0, 0.0, 0.0
for batch in test_reader():
num_batch_examples = len(batch)
eval_step += 1
np_labels, np_infers, np_lens, _ = exe.run(
feed=data_feeder.feed(batch), fetch_list=fetch_list)
label_num, infer_num, correct_num = chunk_eval(
np_labels, np_infers, np_lens, 7, dev_count)
total_infer += infer_num
total_label += label_num
total_correct += correct_num
precision, recall, f1 = calculate_f1(total_label, total_infer,
total_correct)
eval_time_used = time.time() - eval_time_begin
eval_speed = eval_step / eval_time_used
logger.info(
"[%s evaluation] F1-Score=%f, precision=%f, recall=%f [step/sec: %.2f]"
% (phase, f1, precision, recall, eval_speed))
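Note that the chunk counts are accumulated over every batch and F1 is computed once on the totals (micro-averaging), which differs from averaging per-batch F1 scores; a toy check, illustration only:
# batch 1: 8 gold, 10 predicted, 6 correct -> per-batch F1 = 2*0.6*0.75/1.35 ~= 0.667
# batch 2: 2 gold,  0 predicted, 0 correct -> per-batch F1 = 0.0
# micro totals: 10 gold, 10 predicted, 6 correct -> P = R = 0.6, F1 = 0.6
# a naive mean of per-batch F1 would give ~0.333 instead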
def _finetune_cls_task(task, data_reader, feed_list, config=None,
do_eval=False):
main_program = task.main_program()
startup_program = task.startup_program()
loss = task.variable("loss")
......@@ -175,11 +318,15 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
def finetune_and_eval(task, data_reader, feed_list, config=None):
_finetune_model(task, data_reader, feed_list, config, do_eval=True)
if task.task_type == "sequence_labeling":
_finetune_seq_label_task(
task, data_reader, feed_list, config, do_eval=True)
else:
_finetune_cls_task(task, data_reader, feed_list, config, do_eval=True)
def finetune(task, data_reader, feed_list, config=None):
_finetune_model(task, data_reader, feed_list, config, do_eval=False)
_finetune_cls_task(task, data_reader, feed_list, config, do_eval=False)
def evaluate(task, data_reader, feed_list, phase="test", config=None):
......@@ -217,3 +364,107 @@ def evaluate(task, data_reader, feed_list, phase="test", config=None):
(phase, avg_loss, avg_acc, eval_speed))
return avg_loss, avg_acc, eval_speed
# Sequence label evaluation functions
def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
def extract_bio_chunk(seq):
chunks = []
cur_chunk = None
null_index = tag_num - 1
for index in range(len(seq)):
tag = seq[index]
tag_type = tag // 2
tag_pos = tag % 2
if tag == null_index:
if cur_chunk is not None:
chunks.append(cur_chunk)
cur_chunk = None
continue
if tag_pos == 0:
if cur_chunk is not None:
chunks.append(cur_chunk)
cur_chunk = {}
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
else:
if cur_chunk is None:
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
continue
if cur_chunk["type"] == tag_type:
cur_chunk["en"] = index + 1
else:
chunks.append(cur_chunk)
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
if cur_chunk is not None:
chunks.append(cur_chunk)
return chunks
null_index = tag_num - 1
num_label = 0
num_infer = 0
num_correct = 0
labels = np_labels.reshape([-1]).astype(np.int32).tolist()
infers = np_infers.reshape([-1]).astype(np.int32).tolist()
all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
base_index = 0
for dev_index in range(dev_count):
lens = all_lens[dev_index]
max_len = 0
for l in lens:
max_len = max(max_len, l)
for i in range(len(lens)):
seq_st = base_index + i * max_len + 1
seq_en = seq_st + (lens[i] - 2)
infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
num_infer += len(infer_chunks)
num_label += len(label_chunks)
infer_index = 0
label_index = 0
while label_index < len(label_chunks) \
and infer_index < len(infer_chunks):
if infer_chunks[infer_index]["st"] \
< label_chunks[label_index]["st"]:
infer_index += 1
elif infer_chunks[infer_index]["st"] \
> label_chunks[label_index]["st"]:
label_index += 1
else:
if infer_chunks[infer_index]["en"] \
== label_chunks[label_index]["en"] \
and infer_chunks[infer_index]["type"] \
== label_chunks[label_index]["type"]:
num_correct += 1
infer_index += 1
label_index += 1
base_index += max_len * len(lens)
return num_label, num_infer, num_correct
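A toy trace of extract_bio_chunk under the 7-tag MSRA scheme (ids assumed to follow the label list order; note chunk_eval has already sliced off the [CLS]/[SEP] positions via seq_st/seq_en before extraction):
# tags: B-PER I-PER O B-LOC O -> ids [0, 1, 6, 4, 6], null_index = 6
# id 0: tag_type 0, tag_pos 0 (B) -> open chunk {st: 0, en: 1, type: 0}
# id 1: tag_type 0, tag_pos 1 (I) -> same type, extend chunk to en = 2
# id 6: null_index                -> close chunk {st: 0, en: 2, type: 0}
# id 4: tag_type 2, tag_pos 0 (B) -> open chunk {st: 3, en: 4, type: 2}
# id 6: null_index                -> close chunk {st: 3, en: 4, type: 2}
# result: [{'st': 0, 'en': 2, 'type': 0}, {'st': 3, 'en': 4, 'type': 2}]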
def calculate_f1(num_label, num_infer, num_correct):
if num_infer == 0:
precision = 0.0
else:
precision = num_correct * 1.0 / num_infer
if num_label == 0:
recall = 0.0
else:
recall = num_correct * 1.0 / num_label
if num_correct == 0:
f1 = 0.0
else:
f1 = 2 * precision * recall / (precision + recall)
return precision, recall, f1
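A quick arithmetic check of the metric (illustration only): with 10 gold chunks, 8 predicted, and 6 exact matches,
# precision = 6 / 8  = 0.75
# recall    = 6 / 10 = 0.60
# f1 = 2 * 0.75 * 0.60 / (0.75 + 0.60) = 0.9 / 1.35 ~= 0.667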
......@@ -55,7 +55,6 @@ def append_mlp_classifier(feature, label, num_classes=2, hidden_units=None):
accuracy = fluid.layers.accuracy(
input=probs, label=label, total=num_example)
# TODO: encapsulate to Task
graph_var_dict = {
"loss": loss,
"probs": probs,
......@@ -77,5 +76,40 @@ def append_mlp_multi_classifier(feature,
pass
def append_sequence_labler(feature, label):
pass
def append_sequence_labeler(feature, labels, seq_len, num_classes=None):
logits = fluid.layers.fc(
input=feature,
size=num_classes,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_seq_label_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_seq_label_out_b",
initializer=fluid.initializer.Constant(0.)))
ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(logits, axis=2), shape=[-1, 1])
labels = fluid.layers.flatten(labels, axis=2)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=fluid.layers.flatten(logits, axis=2),
label=labels,
return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
# accuracy = fluid.layers.accuracy(
# input=probs, label=labels, total=num_example)
graph_var_dict = {
"loss": loss,
"probs": probs,
"labels": ret_labels,
"infers": ret_infers,
"seq_len": seq_len
}
task = Task("sequence_labeling", graph_var_dict,
fluid.default_main_program(), fluid.default_startup_program())
return task
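For orientation, the tensor shapes this head assumes (inferred from num_flatten_dims=2 and the reshapes above, not stated explicitly in the commit):
# feature (ERNIE "sequence_output"): [batch, max_seq_len, hidden]
# logits after fc(num_flatten_dims=2): [batch, max_seq_len, num_classes]
# ret_labels / ret_infers after reshape: [batch * max_seq_len, 1]
# flatten(..., axis=2) folds batch and time into one dimension, so
# softmax_with_cross_entropy sees [batch * max_seq_len, num_classes] logits
# against [batch * max_seq_len, 1] label ids; padded positions are skipped
# later by chunk_eval using seq_len.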
......@@ -13,3 +13,5 @@
# limitations under the License.
from .nlp_reader import BERTTokenizeReader
from .task_reader import ClassifyReader
from .task_reader import SequenceLabelReader
......@@ -20,63 +20,8 @@ from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add masking to batch_tokens; return out, mask_label, mask_pos.
Note: mask_pos refers to positions in batch_tokens after padding.
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so low=1
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure at least one word in each sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
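For the record, the probability thresholds in the removed mask() encode BERT's 80/10/10 corruption split; a quick check (plain arithmetic, illustration only):
# prob <= 0.15        : token selected for corruption (15% of tokens)
# 0.03  < prob <= 0.15: replaced with [MASK]      -> 0.12 / 0.15 = 80% of selected
# 0.015 < prob <= 0.03: replaced with a random id -> 0.015 / 0.15 = 10% of selected
# prob <= 0.015       : kept unchanged            -> 0.015 / 0.15 = 10% of selected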
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
max_seq_len=128,
pad_id=None,
cls_id=None,
......@@ -103,17 +48,7 @@ def prepare_batch_data(insts,
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, max_seq_len=max_seq_len, return_input_mask=True)
......@@ -130,12 +65,7 @@ def prepare_batch_data(insts,
return_pos=False,
return_input_mask=False)
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
......@@ -146,7 +76,8 @@ def pad_batch_data(insts,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False):
return_num_token=False,
return_seq_lens=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
......@@ -187,4 +118,8 @@ def pad_batch_data(insts,
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype("int64").reshape([-1, 1])]
return return_list if len(return_list) > 1 else return_list[0]
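A minimal numpy check of the new return_seq_lens branch (illustration only, assuming two unpadded token-id sequences):
import numpy as np
insts = [[11, 12, 13], [21, 22]]
seq_lens = np.array([len(inst) for inst in insts])
print(seq_lens.astype("int64").reshape([-1, 1]))  # [[3] [2]]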
......@@ -83,20 +83,16 @@ class BERTTokenizeReader(object):
def generate_batch_data(self,
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False):
return prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
max_seq_len=self.max_seq_len,
pad_id=self.vocab["[PAD]"],
cls_id=self.vocab["[CLS]"],
sep_id=self.vocab["[SEP]"],
mask_id=-1,
return_input_mask=return_input_mask,
return_max_len=return_max_len,
return_num_token=return_num_token)
......@@ -166,8 +162,6 @@ class BERTTokenizeReader(object):
batch_data = self.generate_batch_data(
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=True,
return_num_token=False)
......
......@@ -11,6 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Store PaddleHub version string """
""" PaddleHub version string """
hub_version = "0.3.1.alpha"
module_proto_version = "0.1.0"