Commit 33df1dc6 authored by zhanghan17

ernie-gram release demos

Parent 6bc8cd33
......@@ -19,12 +19,13 @@ from __future__ import unicode_literals
import json
import logging
import math
import six
if six.PY2:
from pathlib2 import Path
else:
from pathlib import Path
import numpy as np
import paddle as P
from paddle import nn
from paddle.nn import functional as F
......@@ -36,7 +37,35 @@ ACT_DICT = {
'relu': nn.ReLU,
'gelu': nn.GELU,
}
def _get_rel_pos_bias(seq_len, max_len=128, num_buckets=32, bidirectional=True, reset=True):
#max_len = 520
pos = np.array(range(seq_len))
rel_pos = pos[:, None] - pos[None, :]
ret = 0
n = -rel_pos
if bidirectional:
num_buckets //= 2
ret += (n < 0).astype('int32') * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets
n = np.abs(n)
else:
n = np.maximum(n, np.zeros_like(n))
# now n is in the range [0, inf)
# half of the buckets are for exact increments in positions
max_exact = num_buckets // 2
is_small = n < max_exact
# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
val_if_large = max_exact + (np.log(n.astype('float32') / max_exact) / math.log(max_len / max_exact) * (num_buckets - max_exact)).astype('int32')
tmp = np.full_like(val_if_large, num_buckets-1)
val_if_large = np.where(val_if_large < tmp, val_if_large, tmp)
ret += np.where(is_small, n, val_if_large)
if reset:
num_buckets *= 2
ret[:, 0] = num_buckets
ret[0, :] = num_buckets // 2
return np.array(ret).reshape([seq_len, seq_len]).astype("int64")
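# Illustration: the result is a [seq_len, seq_len] int64 matrix of relative
# distance-bucket ids. Half of the buckets index exact small offsets and the
# other half log-spaced offsets up to max_len; with reset=True, column 0 and
# row 0 are remapped to dedicated buckets, presumably so the first token
# ([CLS]) gets its own bias embeddings. A quick sanity check (assumed usage):
#
#   bias = _get_rel_pos_bias(8)
#   assert bias.shape == (8, 8) and bias.dtype == np.int64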
def _build_linear(n_in, n_out, name, init):
return nn.Linear(
......@@ -223,6 +252,8 @@ class PretrainedModel(object):
'ernie-2.0-en': bce + 'model-ernie2.0-en.1.tar.gz',
'ernie-2.0-large-en': bce + 'model-ernie2.0-large-en.1.tar.gz',
'ernie-tiny': bce + 'model-ernie_tiny.1.tar.gz',
'ernie-gram-zh': bce + 'model-ernie-gram-zh.1.tar.gz',
'ernie-gram-en': bce + 'model-ernie-gram-en.1.tar.gz',
}
@classmethod
......@@ -283,10 +314,14 @@ class ErnieModel(nn.Layer, PretrainedModel):
d_vocab = cfg['vocab_size']
d_pos = cfg['max_position_embeddings']
d_sent = cfg.get("sent_type_vocab_size") or cfg['type_vocab_size']
self.d_rel_pos = cfg.get('rel_pos_size', None)
max_seq_len = cfg.get("max_seq_len", 512)
self.n_head = cfg['num_attention_heads']
self.return_additional_info = cfg.get('return_additional_info', False)
initializer = nn.initializer.TruncatedNormal(
std=cfg['initializer_range'])
if self.d_rel_pos:
self.rel_pos_bias = _get_rel_pos_bias(max_seq_len)
self.ln = _build_ln(d_model, name=append_name(name, 'pre_encoder'))
self.word_emb = nn.Embedding(
......@@ -307,6 +342,13 @@ class ErnieModel(nn.Layer, PretrainedModel):
weight_attr=P.ParamAttr(
name=append_name(name, 'sent_embedding'),
initializer=initializer))
if self.d_rel_pos:
self.rel_pos_bias_emb = nn.Embedding(
self.d_rel_pos,
self.n_head,
weight_attr=P.ParamAttr(
name=append_name(name, 'rel_pos_embedding'),
initializer=initializer))
prob = cfg['hidden_dropout_prob']
self.dropout = nn.Dropout(p=prob)
......@@ -347,6 +389,7 @@ class ErnieModel(nn.Layer, PretrainedModel):
attn_bias=None,
past_cache=None,
use_causal_mask=False):
"""
Args:
src_ids (`Variable` of shape `[batch_size, seq_len]`):
......@@ -402,15 +445,20 @@ class ErnieModel(nn.Layer, PretrainedModel):
attn_bias = (1. - attn_bias) * -10000.0
attn_bias = attn_bias.unsqueeze(1).tile(
[1, self.n_head, 1, 1]) # avoid broadcast =_=
attn_bias.stop_gradient = True
if sent_ids is None:
sent_ids = P.zeros_like(src_ids)
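# ERNIE-Gram relative position bias: look up a per-head bias for every
# (query, key) distance bucket and fold it into the additive attention mask;
# the embedding output [seq, seq, n_head] is transposed to [n_head, seq, seq]
# so it broadcasts over the batch dimension of attn_bias.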
if self.d_rel_pos:
rel_pos_ids = self.rel_pos_bias[:d_seqlen, :d_seqlen]
rel_pos_ids = P.to_tensor(rel_pos_ids, dtype='int64')
rel_pos_bias = self.rel_pos_bias_emb(rel_pos_ids).transpose([2, 0, 1])
attn_bias += rel_pos_bias
src_embedded = self.word_emb(src_ids)
pos_embedded = self.pos_emb(pos_ids)
sent_embedded = self.sent_emb(sent_ids)
embedded = src_embedded + pos_embedded + sent_embedded
embedded = self.dropout(self.ln(embedded))
encoded, hidden_list, cache_list = self.encoder_stack(
......
......@@ -87,6 +87,8 @@ class ErnieTokenizer(object):
'ernie-tiny': bce + 'model-ernie_tiny.1.tar.gz',
'ernie-gen-base-en': bce + 'model-ernie-gen-base-en.1.tar.gz',
'ernie-gen-large-en': bce + 'model-ernie-gen-large-en.1.tar.gz',
'ernie-gram-zh': bce + 'model-ernie-gram-zh.1.tar.gz',
'ernie-gram-en': bce + 'model-ernie-gram-en.1.tar.gz',
}
@classmethod
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import logging
import json
import re
from random import random
from functools import reduce, partial
import numpy as np
#from visualdl import LogWriter
from pathlib import Path
import paddle as P
from propeller import log
import propeller.paddle as propeller
#from model.bert import BertConfig, BertModelLayer
from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification
from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
from ernie_gram.optimization import AdamW
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
log.setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
parser = propeller.ArgumentParser('classification model with ERNIE')
parser.add_argument(
'--from_pretrained',
type=Path,
required=True,
help='pretrained model directory or tag')
parser.add_argument(
'--max_seqlen',
type=int,
default=128,
help='max sentence length, should not be greater than 512')
parser.add_argument('--bsz', type=int, default=32, help='batch size')
parser.add_argument(
'--data_dir',
type=str,
required=True,
help='data directory containing train / dev data')
parser.add_argument(
'--max_steps',
type=int,
required=True,
help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE')
parser.add_argument('--warmup_proportion', type=float, default=0.1)
parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
parser.add_argument('--lr_decay', type=float, default=0.8, help='layerwise learning decay rate')
parser.add_argument('--decay_layers', type=int, default=12, help='number of layers for layerwise learning rate decay')
parser.add_argument('--label_map', type=str, default="", help='str to int')
parser.add_argument('--num_labels', type=int, default=2, help='number of labels')
parser.add_argument('--valid_steps', type=int, default=100, help='The steps interval to evaluate model performance.')
parser.add_argument('--pair_input', type=int, default=0, help='is sentence pair task or not')
parser.add_argument(
'--save_dir', type=Path, required=True, help='model output directory')
parser.add_argument(
'--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
parser.add_argument(
'--init_checkpoint',
type=str,
default=None,
help='checkpoint to warm start from')
parser.add_argument(
'--use_amp',
action='store_true',
help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
)
args = parser.parse_args()
env = P.distributed.ParallelEnv()
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)
if args.label_map:
label_map = {k.encode(): v for k, v in json.loads(args.label_map).items()}
else:
label_map = {str(l).encode(): l for l in range(args.num_labels)}
text_col_names = ["seg_a", "seg_b"] if args.pair_input else ["seg_a"]
feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn(
col_name,
unk_id=tokenizer.unk_id,
vocab_dict=tokenizer.vocab,
tokenizer=tokenizer.tokenize) for col_name in text_col_names] + [
propeller.data.LabelColumn(
'label',
vocab_dict=label_map),
])
def map_fn_pair(seg_a, seg_b, label):
seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen)
sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b)
return sentence, segments, label
def map_fn_single(seg_a, label):
seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=args.max_seqlen)
sentence, segments = tokenizer.build_for_ernie(seg_a, [])
return sentence, segments, label
map_fn = map_fn_pair if args.pair_input else map_fn_single
train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'),
shuffle=True, repeat=True, use_gz=False, shard=True) \
.map(map_fn) \
.padded_batch(args.bsz, (0, 0, 0))
dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'),
shuffle=False, repeat=False, use_gz=False) \
.map(map_fn) \
.padded_batch(args.bsz, (0, 0, 0))
test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'),
shuffle=False, repeat=False, use_gz=False) \
.map(map_fn) \
.padded_batch(args.bsz, (0, 0, 0))
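# Each dataset yields (token_ids, segment_ids, label); padded_batch pads the
# two id tensors with 0 (the [PAD] id) up to the longest sequence in the batch.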
shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1])
types = ('int64', 'int64', 'int64')
P.distributed.init_parallel_env()
model = ErnieModelForSequenceClassification.from_pretrained(
args.from_pretrained, num_labels=args.num_labels, name='')
if args.init_checkpoint is not None:
log.info('loading checkpoint from %s' % args.init_checkpoint)
sd = P.load(args.init_checkpoint)
model.set_state_dict(sd)
model = P.DataParallel(model)
g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental
param_name_to_exclude_from_weight_decay = re.compile(
    r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
lr_scheduler = P.optimizer.lr.LambdaDecay(
args.lr,
get_warmup_and_linear_decay(args.max_steps,
int(args.warmup_proportion * args.max_steps)))
opt = AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
weight_decay=args.wd,
grad_clip=g_clip,
layerwise_lr_decay_rate=args.lr_decay,
n_layers=args.decay_layers)
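# Layerwise lr decay (see ernie_gram/optimization.py below): parameters of
# encoder layer k train at lr * lr_decay ** (n_layers - k), embeddings at
# lr * lr_decay ** (n_layers + 1), everything else at the full lr.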
scaler = P.amp.GradScaler(enable=args.use_amp)
step = 0
create_if_not_exists(args.save_dir)
#with LogWriter(logdir=str(create_if_not_exists(args.save_dir / 'vdl-%d' % env.dev_id))) as log_writer:
with P.amp.auto_cast(enable=args.use_amp):
for ids, sids, label in P.io.DataLoader(
train_ds, places=P.CUDAPlace(env.dev_id), batch_size=None):
step += 1
loss, _ = model(ids, sids, labels=label)
loss = scaler.scale(loss)
loss.backward()
scaler.minimize(opt, loss)
model.clear_gradients()
lr_scheduler.step()
# do logging
if step % 10 == 0:
_lr = lr_scheduler.get_lr()
if args.use_amp:
_l = (loss / scaler._scale).numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
env.dev_id, step, _l, _lr, scaler._scale.numpy())
else:
_l = loss.numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
env.dev_id, step, _l, _lr)
log.debug(msg)
#log_writer.add_scalar('loss', _l, step=step)
#log_writer.add_scalar('lr', _lr, step=step)
# do saving
if step % args.valid_steps == 0 and env.dev_id == 0:
acc = []
with P.no_grad():
model.eval()
for d in P.io.DataLoader(
dev_ds, places=P.CUDAPlace(env.dev_id),
batch_size=None):
ids, sids, label = d
loss, logits = model(ids, sids, labels=label)
a = (logits.argmax(-1) == label)
acc.append(a.numpy())
model.train()
acc = np.concatenate(acc).mean()
#log_writer.add_scalar('eval/acc', acc, step=step)
log.debug('dev acc %.5f' % acc)
acc = []
with P.no_grad():
model.eval()
for d in P.io.DataLoader(
test_ds, places=P.CUDAPlace(env.dev_id),
batch_size=None):
ids, sids, label = d
loss, logits = model(ids, sids, labels=label)
a = (logits.argmax(-1) == label)
acc.append(a.numpy())
model.train()
acc = np.concatenate(acc).mean()
#log_writer.add_scalar('eval/acc', acc, step=step)
log.debug('test acc %.5f' % acc)
if args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
# exit
if step > args.max_steps:
break
if args.save_dir is not None and env.dev_id == 0:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
log.debug('done')
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import os
import re
import time
import logging
import json
from pathlib import Path
from random import random
from tqdm import tqdm
from functools import reduce, partial
import pickle
import argparse
from io import open
import numpy as np
import paddle as P
from propeller import log
import propeller.paddle as propeller
from ernie_gram.optimization import AdamW
from ernie.modeling_ernie import ErnieModel, ErnieModelForQuestionAnswering
from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
#from ernie.optimization import AdamW, LinearDecay
from ernie_gram.mrc import mrc_reader
from ernie_gram.mrc import mrc_metrics
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
log.setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
def evaluate(model, ds, all_examples, all_features, tokenizer, args):
dev_file = json.loads(open(args.dev_file, encoding='utf8').read())
with P.no_grad():
log.debug('start eval')
model.eval()
all_res = []
for step, (uids, token_ids, token_type_ids, _, __) in enumerate(
P.io.DataLoader(
ds, places=P.CUDAPlace(env.dev_id), batch_size=None)):
_, start_logits, end_logits = model(token_ids, token_type_ids)
res = [
mrc_metrics.RawResult(
unique_id=u, start_logits=s, end_logits=e)
for u, s, e in zip(uids.numpy(),
start_logits.numpy(), end_logits.numpy())
]
all_res += res
open('all_res', 'wb').write(pickle.dumps(all_res))
all_pred, all_nbests = mrc_metrics.make_results(
tokenizer,
all_examples,
all_features,
all_res,
n_best_size=args.n_best_size,
max_answer_length=args.max_answer_length,
do_lower_case=tokenizer.lower)
f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred)
model.train()
log.debug('done eval')
return f1, em
def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
tokenizer, args):
model = P.DataParallel(model)
max_steps = args.max_steps
g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental
lr_scheduler = P.optimizer.lr.LambdaDecay(
args.lr,
get_warmup_and_linear_decay(max_steps,
int(args.warmup_proportion * max_steps)))
opt = AdamW(
lr_scheduler,
parameters=model.parameters(),
weight_decay=args.wd,
grad_clip=g_clip)
train_dataset = train_dataset \
.cache_shuffle_shard(env.nranks, env.dev_id, drop_last=True) \
.padded_batch(args.bsz)
log.debug('init training with args: %s' % repr(args))
scaler = P.amp.GradScaler(enable=args.use_amp)
create_if_not_exists(args.save_dir)
with P.amp.auto_cast(enable=args.use_amp):
for step, (_, token_ids, token_type_ids, start_pos,
end_pos) in enumerate(
P.io.DataLoader(
train_dataset,
places=P.CUDAPlace(env.dev_id),
batch_size=None)):
loss, _, __ = model(
token_ids,
token_type_ids,
start_pos=start_pos,
end_pos=end_pos)
loss = scaler.scale(loss)
loss.backward()
scaler.minimize(opt, loss)
model.clear_gradients()
lr_scheduler.step()
if env.dev_id == 0 and step % 10 == 0 and step:
_lr = lr_scheduler.get_lr()
if args.use_amp:
_l = (loss / scaler._scale).numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
env.dev_id, step, _l, _lr, scaler._scale.numpy())
else:
_l = loss.numpy()
msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
env.dev_id, step, _l, _lr)
log.debug(msg)
if env.dev_id == 0 and step % 100 == 0 and step:
f1, em = evaluate(model, dev_dataset, dev_examples,
dev_features, tokenizer, args)
log.debug('[step %d] eval result: f1 %.5f em %.5f' %
(step, f1, em))
if env.dev_id == 0 and args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
if step > max_steps:
break
if __name__ == "__main__":
parser = argparse.ArgumentParser('MRC model with ERNIE')
parser.add_argument(
'--from_pretrained',
type=Path,
required=True,
help='pretrained model directory or tag')
parser.add_argument(
'--max_seqlen',
type=int,
default=512,
help='max sentence length, should not be greater than 512')
parser.add_argument('--bsz', type=int, default=16, help='batch size')
parser.add_argument('--max_steps', type=int, required=True, help='max steps')
parser.add_argument(
'--train_file',
type=str,
required=True,
help='path to the train data file')
parser.add_argument(
'--dev_file',
type=str,
required=True,
help='path to the dev data file')
parser.add_argument('--warmup_proportion', type=float, default=0.0)
parser.add_argument('--lr', type=float, default=3e-5, help='learning rate')
parser.add_argument(
'--save_dir', type=Path, required=True, help='model output directory')
parser.add_argument(
'--n_best_size', type=int, default=20, help='nbest prediction to keep')
parser.add_argument(
'--max_answer_length', type=int, default=100, help='max answer span')
parser.add_argument(
'--wd',
type=float,
default=0.01,
help='weight decay, aka L2 regularizer')
parser.add_argument(
'--use_amp',
action='store_true',
help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
)
args = parser.parse_args()
env = P.distributed.ParallelEnv()
P.distributed.init_parallel_env()
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
if not os.path.exists(args.train_file):
raise RuntimeError('input data not found at %s' % args.train_file)
if not os.path.exists(args.dev_file):
raise RuntimeError('input data not found at %s' % args.dev_file)
log.info('making train/dev data...')
train_examples = mrc_reader.read_files(args.train_file, is_training=True)
train_features = mrc_reader.convert_example_to_features(
train_examples, args.max_seqlen, tokenizer, is_training=True)
dev_examples = mrc_reader.read_files(args.dev_file, is_training=False)
dev_features = mrc_reader.convert_example_to_features(
dev_examples, args.max_seqlen, tokenizer, is_training=False)
log.info('train examples: %d, features: %d' %
(len(train_examples), len(train_features)))
def map_fn(unique_id, example_index, doc_span_index, tokens,
token_to_orig_map, token_is_max_context, token_ids,
position_ids, text_type_ids, start_position, end_position):
if start_position is None:
start_position = 0
if end_position is None:
end_position = 0
return np.array(unique_id), np.array(token_ids), np.array(
text_type_ids), np.array(start_position), np.array(end_position)
train_dataset = propeller.data.Dataset.from_list(train_features).map(
map_fn)
dev_dataset = propeller.data.Dataset.from_list(dev_features).map(
map_fn).padded_batch(args.bsz)
model = ErnieModelForQuestionAnswering.from_pretrained(
args.from_pretrained, name='')
train(model, train_dataset, dev_dataset, dev_examples, dev_features,
tokenizer, args)
if env.dev_id == 0:
f1, em = evaluate(model, dev_dataset, dev_examples, dev_features,
tokenizer, args)
log.debug('final eval result: f1 %.5f em %.5f' % (f1, em))
if env.dev_id == 0 and args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
import logging
import six
import json
from random import random
from tqdm import tqdm
from collections import OrderedDict
from functools import reduce, partial
from pathlib import Path
from visualdl import LogWriter
import numpy as np
import multiprocessing
import pickle
from sklearn.metrics import f1_score
import paddle as P
from propeller import log
import propeller.paddle as propeller
log.setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification, ErnieModelForTokenClassification
from ernie.tokenizing_ernie import ErnieTokenizer
from ernie_gram.optimization import AdamW
parser = propeller.ArgumentParser('NER model with ERNIE')
parser.add_argument('--max_seqlen', type=int, default=256)
parser.add_argument('--bsz', type=int, default=16)
parser.add_argument('--data_dir', type=str, required=True)
parser.add_argument('--epoch', type=int, default=10)
parser.add_argument(
'--warmup_proportion',
type=float,
default=0.1,
help='if use_lr_decay is set, '
'learning rate will rise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. at `max_steps`'
)
parser.add_argument(
'--max_steps',
type=int,
required=True,
help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE, used in learning rate scheduler'
)
parser.add_argument(
'--use_amp',
action='store_true',
help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
)
parser.add_argument('--from_pretrained', type=Path, required=True)
parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
parser.add_argument(
'--save_dir', type=Path, required=True, help='model output directory')
parser.add_argument(
'--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
args = parser.parse_args()
tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
def tokenizer_func(inputs):
ret = inputs.split(b'\2')
tokens, orig_pos = [], []
for i, r in enumerate(ret):
t = tokenizer.tokenize(r)
for tt in t:
tokens.append(tt)
orig_pos.append(i)
assert len(tokens) == len(orig_pos)
return tokens + orig_pos
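# tokens and their source-word indices are returned concatenated in a single
# list; before() below splits them back apart with np.split(seg, 2) so the
# per-word labels can be realigned to the sub-word tokens.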
def tokenizer_func_for_label(inputs):
return inputs.split(b'\2')
feature_map = {
b"B-PER": 0,
b"I-PER": 1,
b"B-ORG": 2,
b"I-ORG": 3,
b"B-LOC": 4,
b"I-LOC": 5,
b"O": 6,
}
other_tag_id = feature_map[b'O']
feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn(
'text_a',
unk_id=tokenizer.unk_id,
vocab_dict=tokenizer.vocab,
tokenizer=tokenizer_func), propeller.data.TextColumn(
'label',
unk_id=other_tag_id,
vocab_dict=feature_map,
tokenizer=tokenizer_func_for_label, )
])
def before(seg, label):
seg, orig_pos = np.split(seg, 2)
aligned_label = label[orig_pos]
seg, _ = tokenizer.truncate(seg, [], args.max_seqlen)
aligned_label, _ = tokenizer.truncate(aligned_label, [], args.max_seqlen)
orig_pos, _ = tokenizer.truncate(orig_pos, [], args.max_seqlen)
sentence, segments = tokenizer.build_for_ernie(
seg
) #utils.data.build_1_pair(seg, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
aligned_label = np.concatenate([[0], aligned_label, [0]], 0)
orig_pos = np.concatenate([[0], orig_pos, [0]])
assert len(aligned_label) == len(sentence) == len(orig_pos), (
len(aligned_label), len(sentence), len(orig_pos)) # aligned
return sentence, segments, aligned_label, label, orig_pos
train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0))
dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0))
test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0))
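# Pad values per field: 0 for token and segment ids, -100 for the aligned
# labels (presumably the loss ignore_index, so padding is masked out of the
# token-classification loss), and other_tag_id + 1 as the pad tag for the raw
# labels, which evaluate() strips before computing chunk F1.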
def evaluate(model, dataset):
model.eval()
with P.no_grad():
chunkf1 = propeller.metrics.ChunkF1(None, None, None, len(feature_map))
for step, (ids, sids, aligned_label, label, orig_pos
) in enumerate(P.io.DataLoader(
dataset, batch_size=None)):
loss, logits = model(ids, sids)
#print('\n'.join(map(str, logits.numpy().tolist())))
assert orig_pos.shape[0] == logits.shape[0] == ids.shape[
0] == label.shape[0]
for pos, lo, la, id in zip(orig_pos.numpy(),
logits.numpy(),
label.numpy(), ids.numpy()):
_dic = OrderedDict()
assert len(pos) == len(lo) == len(id)
for _pos, _lo, _id in zip(pos, lo, id):
if _id > tokenizer.mask_id: # [MASK] is the largest special token
_dic.setdefault(_pos, []).append(_lo)
merged_lo = np.array(
[np.array(l).mean(0) for _, l in six.iteritems(_dic)])
merged_preds = np.argmax(merged_lo, -1)
la = la[np.where(la != (other_tag_id + 1))] #remove pad
if len(la) > len(merged_preds):
log.warn(
'accuracy loss due to truncation: label len:%d, truncate to %d'
% (len(la), len(merged_preds)))
merged_preds = np.pad(merged_preds,
[0, len(la) - len(merged_preds)],
mode='constant',
constant_values=7)
else:
assert len(la) == len(
merged_preds
), 'expect label == prediction, got %d vs %d' % (
la.shape, merged_preds.shape)
chunkf1.update((merged_preds, la, np.array(len(la))))
#f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro')
f1 = chunkf1.eval()
model.train()
return f1
model = ErnieModelForTokenClassification.from_pretrained(
args.from_pretrained,
num_labels=len(feature_map),
name='',
has_pooler=False)
g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental
param_name_to_exclude_from_weight_decay = re.compile(
    r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
lr_scheduler = P.optimizer.lr.LambdaDecay(
args.lr,
get_warmup_and_linear_decay(args.max_steps,
int(args.warmup_proportion * args.max_steps)))
opt = AdamW(
lr_scheduler,
parameters=model.parameters(),
weight_decay=args.wd,
apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
grad_clip=g_clip)
scaler = P.amp.GradScaler(enable=args.use_amp)
with LogWriter(
logdir=str(create_if_not_exists(args.save_dir / 'vdl'))) as log_writer:
with P.amp.auto_cast(enable=args.use_amp):
for epoch in range(args.epoch):
for step, (
ids, sids, aligned_label, label, orig_pos
) in enumerate(P.io.DataLoader(
train_ds, batch_size=None)):
loss, logits = model(ids, sids, labels=aligned_label)
#loss, logits = model(ids, sids, labels=aligned_label, loss_weights=P.cast(ids != 0, 'float32'))
loss = scaler.scale(loss)
loss.backward()
scaler.minimize(opt, loss)
model.clear_gradients()
lr_scheduler.step()
if step % 10 == 0:
_lr = lr_scheduler.get_lr()
if args.use_amp:
_l = (loss / scaler._scale).numpy()
msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
step, _l, _lr, scaler._scale.numpy())
else:
_l = loss.numpy()
msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l,
_lr)
log.debug(msg)
log_writer.add_scalar('loss', _l, step=step)
log_writer.add_scalar('lr', _lr, step=step)
if step % 100 == 0:
f1 = evaluate(model, dev_ds)
log.debug('dev eval f1: %.5f' % f1)
log_writer.add_scalar('dev eval/f1', f1, step=step)
f1 = evaluate(model, test_ds)
log.debug('test eval f1: %.5f' % f1)
log_writer.add_scalar('test eval/f1', f1, step=step)
if args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
f1 = evaluate(model, dev_ds)
log.debug('final eval f1: %.5f' % f1)
log_writer.add_scalar('eval/f1', f1, step=step)
if args.save_dir is not None:
P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
This diff is collapsed.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import sys
import argparse
import logging
from functools import partial
from io import open
open = partial(open, encoding='utf-8')
import json
from collections import namedtuple
log = logging.getLogger(__name__)
Example = namedtuple('Example', [
'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
'start_position', 'end_position'
])
Feature = namedtuple("Feature", [
"unique_id", "example_index", "doc_span_index", "tokens",
"token_to_orig_map", "token_is_max_context", "token_ids", "position_ids",
"text_type_ids", "start_position", "end_position"
])
def _tokenize_chinese_chars(text):
"""Adds whitespace around any CJK character."""
def _is_chinese_char(cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
output = []
buff = ""
for char in text:
cp = ord(char)
if _is_chinese_char(cp):
if buff != "":
output.append(buff)
buff = ""
output.append(char)
else:
buff += char
if buff != "":
output.append(buff)
return output
def _check_is_max_context(doc_spans, cur_span_index, position):
"""chech is max context"""
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
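# Example: with doc_stride=128 a token can fall into several overlapping doc
# spans; the span where it has the most surrounding context on its smaller
# side (plus the small 0.01 * length bonus) is the one that owns its logits.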
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""improve answer span"""
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
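# Hedged example: if the annotated answer is "1895" inside the doc text
# "(1895-1943)", the initial character-aligned span may cover extra sub-tokens;
# scanning sub-spans recovers the exact tokenized match when one exists,
# otherwise the original span is kept.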
def read_files(input_file, is_training):
"""read file"""
examples = []
with open(input_file, "r") as f:
input_data = json.load(f)["data"]
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_pos = None
end_pos = None
orig_answer_text = None
if is_training:
if len(qa["answers"]) != 1:
raise ValueError(
"For training, each question should have exactly 1 answer."
)
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
doc_tokens = [
paragraph_text[:answer_offset], paragraph_text[
answer_offset:answer_offset + answer_length],
paragraph_text[answer_offset + answer_length:]
]
start_pos = 1
end_pos = 1
actual_text = " ".join(doc_tokens[start_pos:(end_pos +
1)])
if actual_text.find(orig_answer_text) == -1:
log.info("Could not find answer: '%s' vs. '%s'",
actual_text, orig_answer_text)
continue
else:
doc_tokens = _tokenize_chinese_chars(paragraph_text)
example = Example(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_pos,
end_position=end_pos)
examples.append(example)
return examples
def convert_example_to_features(examples,
max_seq_length,
tokenizer,
is_training,
doc_stride=128,
max_query_length=64):
"""convert example to feature"""
features = []
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
#log.info(orig_to_tok_index, example.start_position)
tok_start_position = None
tok_end_position = None
if is_training:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position,
tokenizer, example.orig_answer_text)
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
_DocSpan = namedtuple("DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in query_tokens:
tokens.append(token)
text_type_ids.append(0)
tokens.append("[SEP]")
text_type_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[
split_token_index]
is_max_context = _check_is_max_context(
doc_spans, doc_span_index, split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
text_type_ids.append(1)
tokens.append("[SEP]")
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
start_position = None
end_position = None
if is_training:
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
feature = Feature(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
token_ids=token_ids,
position_ids=position_ids,
text_type_ids=text_type_ids,
start_position=start_position,
end_position=end_position)
features.append(feature)
unique_id += 1
return features
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='main')
parser.add_argument("--input", type=str, default=None)
args = parser.parse_args()
from ernie.tokenizing_ernie import ErnieTokenizer
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
examples = read_files(args.input, True)
features = convert_example_to_features(examples, 512, tokenizer, True)
log.debug(len(examples))
log.debug(len(features))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re
from paddle.fluid import framework
from paddle.fluid.framework import Variable, default_main_program
import numpy as np
import paddle as P
import paddle.distributed.fleet as fleet
from propeller.paddle.train.hooks import RunHook
import paddle.fluid as F
from paddle.fluid import layers as L  # used by AdamW.apply_optimize below
log = logging.getLogger(__name__)
from ernie_gram.utils import create_if_not_exists, get_warmup_and_linear_decay
class AdamW(P.optimizer.AdamW):
"""AdamW object for dygraph"""
def __init__(self, *args, **kwargs):
layerwise_lr_decay = kwargs.pop('layerwise_lr_decay_rate', 0.8)
n_layers = kwargs.pop('n_layers', 12)
var_name_to_exclude = kwargs.pop('var_name_to_exclude', '.*layer_norm_scale|.*layer_norm_bias|.*b_0')
super(AdamW, self).__init__(*args, **kwargs)
self.ld = layerwise_lr_decay
self.pat = re.compile(var_name_to_exclude)
self.n_layers = n_layers
def _get_layerwise_lr_decay_rate(self, param):
#if self.pat.match(param.name):
# return 1.0
if param.name.startswith("encoder_layer"):
layer = int(param.name.split("_")[2])
decay_rate = self.ld ** (self.n_layers - layer)
elif "embedding" in param.name:
decay_rate = self.ld ** (self.n_layers + 1)
else:
decay_rate = 1.0
return decay_rate
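# Illustration with the defaults (layerwise_lr_decay_rate=0.8, n_layers=12):
#   encoder_layer_11_* -> 0.8 ** 1  = 0.8     (top layer)
#   encoder_layer_0_*  -> 0.8 ** 12 ~= 0.069  (bottom layer)
#   *embedding*        -> 0.8 ** 13 ~= 0.055
#   all other params   -> 1.0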
def _create_param_lr(self, param_and_grad):
# create learning rate tensor for every parameter
param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate'] * self._get_layerwise_lr_decay_rate(param)
if isinstance(param_lr, Variable):
return param_lr
else:
if param_lr == 1.0:
return self._global_learning_rate()
else:
with default_main_program()._lr_schedule_guard(
is_with_opt=True), framework.name_scope(
'scale_with_param_lr'):
return self._global_learning_rate() * param_lr
def apply_optimize(self, loss, startup_program, params_grads):
super(AdamW, self).apply_optimize(loss, startup_program, params_grads)
for p, g in params_grads:
#log.debug(L.reduce_mean(p))
if not self.pat.match(p.name):
L.assign(p * (1. - self.wd * self.current_step_lr()), p)
def optimization(
loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False, ):
"""do backword for static"""
def exclude_from_weight_decay(param):
name = param[:-len('.master')] if param.endswith('.master') else param  # drop the fp16 master-weight suffix
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
g_clip = P.nn.ClipGradByGlobalNorm(1.0)
lr_scheduler = P.optimizer.lr.LambdaDecay(
learning_rate,
get_warmup_and_linear_decay(num_train_steps, warmup_steps))
optimizer = AdamW(
learning_rate=lr_scheduler,
weight_decay=weight_decay,
grad_clip=g_clip,
apply_decay_param_fun=exclude_from_weight_decay)
if use_fp16:
log.info('AMP activated')
if weight_decay > 0.:
raise ValueError(
'paddle amp will ignore `weight_decay`, see https://github.com/PaddlePaddle/Paddle/issues/29794'
)
#amp_list = P.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
# custom_white_list=['softmax', 'layer_norm', 'gelu'])
optimizer = P.fluid.contrib.mixed_precision.decorate(
optimizer, init_loss_scaling=2**15, use_dynamic_loss_scaling=True)
_, param_grads = optimizer.minimize(loss)
loss_scaling = P.static.default_main_program().global_block().var(
'loss_scaling_0')
else:
_, param_grads = optimizer.minimize(loss)
loss_scaling = None
class LRStepHook(RunHook):
def after_run(self, _, __):
lr_scheduler.step()
log.debug('lr step: %.5f' % lr_scheduler.get_lr())
return LRStepHook(), loss_scaling
source $1
python3 -m paddle.distributed.launch ./ernie_gram/finetune_classifier_distributed.py \
--data_dir $data_dir \
--max_steps $max_steps \
--bsz $bsz \
--lr $lr \
--label_map ${label_map:-""} \
--num_labels $num_labels \
--pair_input $pair_input \
--valid_steps $valid_steps \
--from_pretrained $from_pretrained \
--save_dir checkpoints
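# Assumed usage: sh <this script> <task config>; the sourced config supplies
# $data_dir, $max_steps, $bsz, $lr, $label_map, $num_labels, $pair_input,
# $valid_steps and $from_pretrained (see the task configs below).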
source $1
export CUDA_VISIBLE_DEVICES=0
python3 -m paddle.distributed.launch ./ernie_gram/finetune_mrc.py \
--train_file $train_file \
--dev_file $dev_file \
--max_steps $max_steps \
--lr $lr \
--from_pretrained $from_pretrained \
--save_dir checkpoints
source $1
python3 -m paddle.distributed.launch ./ernie_gram/finetune_ner.py \
--data_dir $data_dir \
--max_steps $max_steps \
--epoch $epoch \
--lr $lr \
--from_pretrained $from_pretrained \
--save_dir checkpoints
train_file="data/cmrc2018/train/train.json"
dev_file="data/cmrc2018/dev/dev.json"
max_steps=1320
lr=1.5e-4
from_pretrained="ernie-gram-zh"
data_dir="data/msra_ner"
epoch=10
max_steps=13040
lr=5e-5
from_pretrained="ernie-gram-zh"
data_dir="data/xnli"
max_steps=4600 # 3 epochs
lr=1.5e-4
label_map='{"contradictory":0,"contradiction":0,"entailment":1,"neutral":2}'
num_labels=3
valid_steps=25
from_pretrained="ernie-gram-zh"
pair_input=1
bsz=32
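# Consumed via `source $1` by the classification launch script above; XNLI is
# a 3-way sentence-pair task, hence pair_input=1 and num_labels=3.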
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import sys
import argparse
import logging
import paddle
class UnpackDataLoader(paddle.io.DataLoader):
def __init__(self, *args, **kwargs):
super(UnpackDataLoader, self).__init__(*args, batch_size=1, **kwargs)
def __iter__(self):
return ([yy[0] for yy in y]
for y in super(UnpackDataLoader, self).__iter__())
def create_if_not_exists(dir):
try:
dir.mkdir(parents=True)
except FileExistsError:
pass
return dir
def get_warmup_and_linear_decay(max_steps, warmup_steps):
if warmup_steps == 0:
return lambda step: 1.0
else:
return lambda step: min(step / warmup_steps, 1. - (step - warmup_steps) / (max_steps - warmup_steps))
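# Usage sketch: the returned callable is intended for P.optimizer.lr.LambdaDecay
# and yields a multiplier on the base lr (a minimal self-check, assuming
# max_steps=100 and warmup_steps=10):
if __name__ == '__main__':
    fn = get_warmup_and_linear_decay(max_steps=100, warmup_steps=10)
    assert fn(5) == 0.5    # linear warmup: step / warmup_steps
    assert fn(10) == 1.0   # peak multiplier at the end of warmup
    assert fn(55) == 0.5   # linear decay toward 0.0 at max_steps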