diff --git a/official/nlp/bert/config.py b/official/nlp/bert/config_args.py
similarity index 100%
rename from official/nlp/bert/config.py
rename to official/nlp/bert/config_args.py
diff --git a/official/nlp/bert/model.py b/official/nlp/bert/model.py
index 1e65929ebd26bddad0d18a764224e4d3663fbaa6..879879ffd4810ff77327ec65275e6c17db18483b 100644
--- a/official/nlp/bert/model.py
+++ b/official/nlp/bert/model.py
@@ -23,7 +23,6 @@ import copy
 import json
 import math
 import os
-import sys
 import urllib
 import urllib.request
 from io import open
@@ -39,7 +38,7 @@ from megengine.module.activation import Softmax
 
 
 def transpose(inp, a, b):
-    cur_shape = [i for i in range(0, len(inp.shape))]
+    cur_shape = list(range(0, len(inp.shape)))
     cur_shape[a], cur_shape[b] = cur_shape[b], cur_shape[a]
     return inp.dimshuffle(*cur_shape)
 
@@ -84,7 +83,7 @@ def gelu(x):
 ACT2FN = {"gelu": gelu, "relu": F.relu}
 
 
-class BertConfig(object):
+class BertConfig:
     """Configuration class to store the configuration of a `BertModel`.
     """
 
@@ -441,6 +440,7 @@ class BertModel(Module):
     """
 
     def __init__(self, config):
+        super().__init__()
         self.embeddings = BertEmbeddings(config)
         self.encoder = BertEncoder(config)
         self.pooler = BertPooler(config)
@@ -537,6 +537,7 @@ class BertForSequenceClassification(Module):
     """
 
     def __init__(self, config, num_labels, bert=None):
+        super().__init__()
         if bert is None:
             self.bert = BertModel(config)
         else:
@@ -577,10 +578,8 @@ MODEL_NAME = {
 
 
 def download_file(url, filename):
-    try:
-        urllib.URLopener().retrieve(url, filename)
-    except:
-        urllib.request.urlretrieve(url, filename)
+    # urllib.URLopener().retrieve(url, filename)
+    urllib.request.urlretrieve(url, filename)
 
 
 def create_hub_bert(model_name, pretrained):
diff --git a/official/nlp/bert/mrpc_dataset.py b/official/nlp/bert/mrpc_dataset.py
index f3042a9b68ec098a4fc37726847587528acca5da..48397812f3d6fb09283619800ae0b26968e43cee 100644
--- a/official/nlp/bert/mrpc_dataset.py
+++ b/official/nlp/bert/mrpc_dataset.py
@@ -20,7 +20,7 @@ from tokenization import BertTokenizer
 logger = mge.get_logger(__name__)
 
 
-class DataProcessor(object):
+class DataProcessor:
     """Base class for data converters for sequence classification data sets."""
 
     def get_train_examples(self, data_dir):
@@ -46,7 +46,7 @@ class DataProcessor(object):
             return lines
 
 
-class InputFeatures(object):
+class InputFeatures:
     """A single set of features of data."""
 
     def __init__(self, input_ids, input_mask, segment_ids, label_id):
@@ -56,7 +56,7 @@ class InputFeatures(object):
         self.label_id = label_id
 
 
-class InputExample(object):
+class InputExample:
     """A single training/test example for simple sequence classification."""
 
     def __init__(self, guid, text_a, text_b=None, label=None):
@@ -195,12 +195,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         label_id = label_map[example.label]
         if ex_index < 0:
             logger.info("*** Example ***")
-            logger.info("guid: %s" % (example.guid))
-            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-            logger.info("label: %s (id = %d)" % (example.label, label_id))
+            logger.info("guid: {}".format(example.guid))
+            logger.info("tokens: {}".format(" ".join([str(x) for x in tokens])))
+            logger.info("input_ids: {}".format(" ".join([str(x) for x in input_ids])))
+            logger.info("input_mask: {}".format(" ".join([str(x) for x in input_mask])))
+            logger.info("segment_ids: {}".format(" ".join([str(x) for x in segment_ids])))
+            logger.info("label: {} (id = {})".format(example.label, label_id))
 
         features.append(
             InputFeatures(
diff --git a/official/nlp/bert/test.py b/official/nlp/bert/test.py
index ebb7ad4c70fc27b42f1b7ac27f816cdc3b14442d..924d0adae85ff83c125579babdc56b58660285c7 100644
--- a/official/nlp/bert/test.py
+++ b/official/nlp/bert/test.py
@@ -12,16 +12,16 @@ import megengine.functional as F
 from megengine.jit import trace
 from tqdm import tqdm
 
-from config import get_args
 from model import BertForSequenceClassification, create_hub_bert
 from mrpc_dataset import MRPCDataset
-
-args = get_args()
+# pylint: disable=import-outside-toplevel
+import config_args
+args = config_args.get_args()
 logger = mge.get_logger(__name__)
 
 
 @trace(symbolic=True)
-def net_eval(input_ids, segment_ids, input_mask, label_ids, opt=None, net=None):
+def net_eval(input_ids, segment_ids, input_mask, label_ids, net=None):
     net.eval()
     results = net(input_ids, segment_ids, input_mask, label_ids)
     logits, loss = results
@@ -39,7 +39,7 @@ def eval(dataloader, net):
 
     sum_loss, sum_accuracy, total_steps, total_examples = 0, 0, 0, 0
 
-    for step, batch in enumerate(tqdm(dataloader, desc="Iteration")):
+    for _, batch in enumerate(tqdm(dataloader, desc="Iteration")):
         input_ids, input_mask, segment_ids, label_ids = tuple(
             mge.tensor(t) for t in batch
         )
diff --git a/official/nlp/bert/tokenization.py b/official/nlp/bert/tokenization.py
index 1ee550a4862cb59ac024ae19e92b2a366c19d99d..20b06005fc7fb146a509481b5b7d5d6d4e0be897 100644
--- a/official/nlp/bert/tokenization.py
+++ b/official/nlp/bert/tokenization.py
@@ -22,7 +22,7 @@ import os
 import unicodedata
 from io import open
 
-import megengine as megengine
+import megengine
 
 logger = megengine.get_logger(__name__)
 
@@ -54,7 +54,7 @@ def whitespace_tokenize(text):
     return tokens
 
 
-class BertTokenizer(object):
+class BertTokenizer:
     """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
 
     def __init__(
@@ -150,7 +150,7 @@ class BertTokenizer(object):
        return vocab_file
 
 
-class BasicTokenizer(object):
+class BasicTokenizer:
     """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
 
     def __init__(
@@ -243,18 +243,19 @@ class BasicTokenizer(object):
         # as is Japanese Hiragana and Katakana. Those alphabets are used to write
         # space-separated words, so they are not treated specially and handled
         # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)  #
-            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
-            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
-            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
-        ):  #
-            return True
-
+        cp_range = [
+            (0x4E00, 0x9FFF),
+            (0x3400, 0x4DBF),
+            (0x20000, 0x2A6DF),
+            (0x2A700, 0x2B73F),
+            (0x2B740, 0x2B81F),
+            (0x2B820, 0x2CEAF),
+            (0xF900, 0xFAFF),
+            (0x2F800, 0x2FA1F),
+        ]
+        for min_cp, max_cp in cp_range:
+            if min_cp <= cp <= max_cp:
+                return True
         return False
 
     def _clean_text(self, text):
@@ -271,7 +272,7 @@ class BasicTokenizer(object):
         return "".join(output)
 
 
-class WordpieceTokenizer(object):
+class WordpieceTokenizer:
     """Runs WordPiece tokenization."""
 
     def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
@@ -335,7 +336,7 @@ def _is_whitespace(char):
     """Checks whether `chars` is a whitespace character."""
     # \t, \n, and \r are technically contorl characters but we treat them
     # as whitespace since they are generally considered as such.
-    if char == " " or char == "\t" or char == "\n" or char == "\r":
+    if char in (" ", "\t", "\n", "\r"):
         return True
     cat = unicodedata.category(char)
     if cat == "Zs":
@@ -347,7 +348,7 @@ def _is_control(char):
     """Checks whether `chars` is a control character."""
     # These are technically control characters but we count them as whitespace
     # characters.
-    if char == "\t" or char == "\n" or char == "\r":
+    if char in ("\t", "\n", "\r"):
         return False
     cat = unicodedata.category(char)
     if cat.startswith("C"):
@@ -363,10 +364,10 @@ def _is_punctuation(char):
     # Punctuation class but we treat them as punctuation anyways, for
     # consistency.
     if (
-        (cp >= 33 and cp <= 47)
-        or (cp >= 58 and cp <= 64)
-        or (cp >= 91 and cp <= 96)
-        or (cp >= 123 and cp <= 126)
+        (33 <= cp <= 47)
+        or (58 <= cp <= 64)
+        or (91 <= cp <= 96)
+        or (123 <= cp <= 126)
     ):
         return True
     cat = unicodedata.category(char)
diff --git a/official/nlp/bert/train.py b/official/nlp/bert/train.py
index 7b93903df8b72610066e297db8d66417b65d8206..1b82d84bb2ab3670e9c5df6777a519c62bc7e60d 100644
--- a/official/nlp/bert/train.py
+++ b/official/nlp/bert/train.py
@@ -13,16 +13,16 @@ import megengine.optimizer as optim
 from megengine.jit import trace
 from tqdm import tqdm
 
-from config import get_args
 from model import BertForSequenceClassification, create_hub_bert
 from mrpc_dataset import MRPCDataset
-
-args = get_args()
+# pylint: disable=import-outside-toplevel
+import config_args
+args = config_args.get_args()
 logger = mge.get_logger(__name__)
 
 
 @trace(symbolic=True)
-def net_eval(input_ids, segment_ids, input_mask, label_ids, opt=None, net=None):
+def net_eval(input_ids, segment_ids, input_mask, label_ids, net=None):
     net.eval()
     results = net(input_ids, segment_ids, input_mask, label_ids)
     logits, loss = results
@@ -49,7 +49,7 @@ def eval(dataloader, net):
 
     sum_loss, sum_accuracy, total_steps, total_examples = 0, 0, 0, 0
 
-    for step, batch in enumerate(tqdm(dataloader, desc="Iteration")):
+    for _, batch in enumerate(tqdm(dataloader, desc="Iteration")):
         input_ids, input_mask, segment_ids, label_ids = tuple(
             mge.tensor(t) for t in batch
         )
@@ -79,7 +79,7 @@
     logger.info("batch size = %d", args.train_batch_size)
 
     sum_loss, sum_accuracy, total_steps, total_examples = 0, 0, 0, 0
-    for step, batch in enumerate(tqdm(dataloader, desc="Iteration")):
+    for _, batch in enumerate(tqdm(dataloader, desc="Iteration")):
         input_ids, input_mask, segment_ids, label_ids = tuple(
             mge.tensor(t) for t in batch
         )