diff --git a/chapter06/lstm/eval.py b/chapter06/lstm/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d731fbd0dfd4e2fd3f40233e79e537dd7d0e83f
--- /dev/null
+++ b/chapter06/lstm/eval.py
@@ -0,0 +1,81 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+#################eval lstm example on aclImdb########################
+python eval.py --ckpt_path=./lstm-20-390.ckpt
+"""
+import argparse
+import os
+
+import numpy as np
+
+from src.config import lstm_cfg as cfg
+from src.dataset import lstm_create_dataset, convert_to_mindrecord
+from src.lstm import SentimentNet
+from mindspore import Tensor, nn, Model, context
+from mindspore.nn import Accuracy
+from mindspore.train.callback import LossMonitor
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='MindSpore LSTM Example')
+    parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'],
+                        help='whether to preprocess data.')
+    parser.add_argument('--aclimdb_path', type=str, default="./aclImdb",
+                        help='path where the dataset is stored.')
+    parser.add_argument('--glove_path', type=str, default="./glove",
+                        help='path where the GloVe is stored.')
+    parser.add_argument('--preprocess_path', type=str, default="./preprocess",
+                        help='path where the pre-process data is stored.')
+    parser.add_argument('--ckpt_path', type=str, default=None,
+                        help='the checkpoint file path used to evaluate model.')
+    parser.add_argument('--device_target', type=str, default="GPU", choices=['GPU', 'CPU'],
+                        help='the target device to run, support "GPU", "CPU". 
Default: "GPU".') + args = parser.parse_args() + + context.set_context( + mode=context.GRAPH_MODE, + save_graphs=False, + device_target=args.device_target) + + if args.preprocess == "true": + print("============== Starting Data Pre-processing ==============") + convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path) + + embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32) + network = SentimentNet(vocab_size=embedding_table.shape[0], + embed_size=cfg.embed_size, + num_hiddens=cfg.num_hiddens, + num_layers=cfg.num_layers, + bidirectional=cfg.bidirectional, + num_classes=cfg.num_classes, + weight=Tensor(embedding_table), + batch_size=cfg.batch_size) + + loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) + opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) + loss_cb = LossMonitor() + + model = Model(network, loss, opt, {'acc': Accuracy()}) + + print("============== Starting Testing ==============") + ds_eval = lstm_create_dataset(args.preprocess_path, cfg.batch_size, training=False) + param_dict = load_checkpoint(args.ckpt_path) + load_param_into_net(network, param_dict) + if args.device_target == "CPU": + acc = model.eval(ds_eval, dataset_sink_mode=False) + else: + acc = model.eval(ds_eval) + print("============== {} ==============".format(acc)) diff --git a/chapter06/lstm/main.py b/chapter06/lstm/main.py deleted file mode 100644 index c9c794dd811b1832f1a262de87f919cb40b6692f..0000000000000000000000000000000000000000 --- a/chapter06/lstm/main.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright 2019 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -LSTM Sample, has train and evaluate part. -The sample can only be run on GPU. 
-""" -import os -import shutil -import math -import argparse -import json -from itertools import chain -import numpy as np -from config import lstm_cfg as cfg - -import mindspore.nn as nn -import mindspore.context as context -import mindspore.dataset as ds -from mindspore.ops import operations as P -from mindspore import Tensor -from mindspore.common.initializer import initializer -from mindspore.common.parameter import Parameter -from mindspore.mindrecord import FileWriter -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor -# Install gensim with 'pip install gensim' -import gensim - - -def encode_samples(tokenized_samples, word_to_idx): - """ encode word to index """ - features = [] - for sample in tokenized_samples: - feature = [] - for token in sample: - if token in word_to_idx: - feature.append(word_to_idx[token]) - else: - feature.append(0) - features.append(feature) - return features - -def pad_samples(features, maxlen=500, pad=0): - """ pad all features to the same length """ - padded_features = [] - for feature in features: - if len(feature) >= maxlen: - padded_feature = feature[:maxlen] - else: - padded_feature = feature - while len(padded_feature) < maxlen: - padded_feature.append(pad) - padded_features.append(padded_feature) - return padded_features - -def read_imdb(path, seg='train'): - """ read imdb dataset """ - pos_or_neg = ['pos', 'neg'] - data = [] - for label in pos_or_neg: - files = os.listdir(os.path.join(path, seg, label)) - for file in files: - with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf: - review = rf.read().replace('\n', '') - if label == 'pos': - data.append([review, 1]) - elif label == 'neg': - data.append([review, 0]) - return data - -def tokenizer(text): - return [tok.lower() for tok in text.split(' ')] - -def collect_weight(glove_path, vocab, word_to_idx, embed_size): - """ collect weight """ - vocab_size = len(vocab) - wvmodel = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(glove_path, 'glove.6B.300d.txt'), - binary=False, encoding='utf-8') - weight_np = np.zeros((vocab_size+1, embed_size)).astype(np.float32) - - idx_to_word = {i+1: word for i, word in enumerate(vocab)} - idx_to_word[0] = '' - - for i in range(len(wvmodel.index2word)): - try: - index = word_to_idx[wvmodel.index2word[i]] - except KeyError: - continue - weight_np[index, :] = wvmodel.get_vector( - idx_to_word[word_to_idx[wvmodel.index2word[i]]]) - return weight_np - -def preprocess(aclimdb_path, glove_path, embed_size): - """ preprocess the train and test data """ - train_data = read_imdb(aclimdb_path, 'train') - test_data = read_imdb(aclimdb_path, 'test') - - train_tokenized = [] - test_tokenized = [] - for review, _ in train_data: - train_tokenized.append(tokenizer(review)) - for review, _ in test_data: - test_tokenized.append(tokenizer(review)) - - vocab = set(chain(*train_tokenized)) - vocab_size = len(vocab) - print("vocab_size: ", vocab_size) - - word_to_idx = {word: i+1 for i, word in enumerate(vocab)} - word_to_idx[''] = 0 - - train_features = np.array(pad_samples(encode_samples(train_tokenized, word_to_idx))).astype(np.int32) - train_labels = np.array([score for _, score in train_data]).astype(np.int32) - test_features = np.array(pad_samples(encode_samples(test_tokenized, word_to_idx))).astype(np.int32) - test_labels = np.array([score for _, score in 
test_data]).astype(np.int32) - - weight_np = collect_weight(glove_path, vocab, word_to_idx, embed_size) - return train_features, train_labels, test_features, test_labels, weight_np, vocab_size - -def get_imdb_data(labels_data, features_data): - data_list = [] - for i, (label, feature) in enumerate(zip(labels_data, features_data)): - data_json = {"id": i, - "label": int(label), - "feature": feature.reshape(-1)} - data_list.append(data_json) - return data_list - -def convert_to_mindrecord(embed_size, aclimdb_path, proprocess_path, glove_path): - """ convert imdb dataset to mindrecord """ - num_shard = 4 - train_features, train_labels, test_features, test_labels, weight_np, _ = \ - preprocess(aclimdb_path, glove_path, embed_size) - np.savetxt(os.path.join(proprocess_path, 'weight.txt'), weight_np) - - # write mindrecord - schema_json = {"id": {"type": "int32"}, - "label": {"type": "int32"}, - "feature": {"type": "int32", "shape":[-1]}} - - writer = FileWriter(os.path.join(proprocess_path, 'aclImdb_train.mindrecord'), num_shard) - data = get_imdb_data(train_labels, train_features) - writer.add_schema(schema_json, "nlp_schema") - writer.add_index(["id", "label"]) - writer.write_raw_data(data) - writer.commit() - - writer = FileWriter(os.path.join(proprocess_path, 'aclImdb_test.mindrecord'), num_shard) - data = get_imdb_data(test_labels, test_features) - writer.add_schema(schema_json, "nlp_schema") - writer.add_index(["id", "label"]) - writer.write_raw_data(data) - writer.commit() - -def init_lstm_weight( - input_size, - hidden_size, - num_layers, - bidirectional, - has_bias=True): - """Initialize lstm weight.""" - num_directions = 1 - if bidirectional: - num_directions = 2 - - weight_size = 0 - gate_size = 4 * hidden_size - for layer in range(num_layers): - for _ in range(num_directions): - input_layer_size = input_size if layer == 0 else hidden_size * num_directions - weight_size += gate_size * input_layer_size - weight_size += gate_size * hidden_size - if has_bias: - weight_size += 2 * gate_size - - stdv = 1 / math.sqrt(hidden_size) - w_np = np.random.uniform(-stdv, stdv, (weight_size, - 1, 1)).astype(np.float32) - w = Parameter( - initializer( - Tensor(w_np), [ - weight_size, 1, 1]), name='weight') - - return w - - -def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): - """init default input.""" - num_directions = 1 - if bidirectional: - num_directions = 2 - - h = Tensor( - np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) - c = Tensor( - np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) - return h, c - - -class SentimentNet(nn.Cell): - """Sentiment network structure.""" - def __init__(self, - vocab_size, - embed_size, - num_hiddens, - num_layers, - bidirectional, - num_classes, - weight, - batch_size): - super(SentimentNet, self).__init__() - self.embedding = nn.Embedding(vocab_size, - embed_size, - embedding_table=weight) - self.embedding.embedding_table.requires_grad = False - self.trans = P.Transpose() - self.perm = (1, 0, 2) - self.encoder = nn.LSTM(input_size=embed_size, - hidden_size=num_hiddens, - num_layers=num_layers, - has_bias=True, - bidirectional=bidirectional, - dropout=0.0) - w_init = init_lstm_weight( - embed_size, - num_hiddens, - num_layers, - bidirectional) - self.encoder.weight = w_init - self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional) - - self.concat = P.Concat(1) - if bidirectional: - self.decoder = nn.Dense(num_hiddens * 4, num_classes) 
- else: - self.decoder = nn.Dense(num_hiddens * 2, num_classes) - - def construct(self, inputs): - # (64,500,300) - embeddings = self.embedding(inputs) - embeddings = self.trans(embeddings, self.perm) - output, _ = self.encoder(embeddings, (self.h, self.c)) - # states[i] size(64,200) -> encoding.size(64,400) - encoding = self.concat((output[0], output[1])) - outputs = self.decoder(encoding) - return outputs - - -def create_dataset(base_path, batch_size, num_epochs, is_train): - """Create dataset for training.""" - columns_list = ["feature", "label"] - num_consumer = 4 - - if is_train: - path = os.path.join(base_path, 'aclImdb_train.mindrecord0') - else: - path = os.path.join(base_path, 'aclImdb_test.mindrecord0') - - dtrain = ds.MindDataset(path, columns_list, num_consumer) - dtrain = dtrain.shuffle(buffer_size=dtrain.get_dataset_size()) - dtrain = dtrain.batch(batch_size, drop_remainder=True) - dtrain = dtrain.repeat(count=num_epochs) - - return dtrain - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='MindSpore LSTM Example') - parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'], - help='Whether to perform data preprocessing') - parser.add_argument('--mode', type=str, default="train", choices=['train', 'test'], - help='implement phase, set to train or test') - # Download dataset from 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' and extract to 'aclimdb_path' - parser.add_argument('--aclimdb_path', type=str, default="./aclImdb", - help='path where the dataset is store') - # Download glove from 'http://nlp.stanford.edu/data/glove.6B.zip' and extract to 'glove_path' - # Add a new line '400000 300' at the beginning of 'glove.6B.300d.txt' with '40000' for total words and '300' for vector length - parser.add_argument('--glove_path', type=str, default="./glove", - help='path where the glove is store') - parser.add_argument('--preprocess_path', type=str, default="./preprocess", - help='path where the pre-process data is store') - parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if mode is test, must provide\ - path where the trained ckpt file') - args = parser.parse_args() - - context.set_context( - mode=context.GRAPH_MODE, - save_graphs=False, - device_target="GPU") - - if args.preprocess == 'true': - print("============== Starting Data Pre-processing ==============") - shutil.rmtree(args.preprocess_path) - os.mkdir(args.preprocess_path) - convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path) - - embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32) - network = SentimentNet(vocab_size=embedding_table.shape[0], - embed_size=cfg.embed_size, - num_hiddens=cfg.num_hiddens, - num_layers=cfg.num_layers, - bidirectional=cfg.bidirectional, - num_classes=cfg.num_classes, - weight=Tensor(embedding_table), - batch_size=cfg.batch_size) - - loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) - opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) - loss_cb = LossMonitor() - model = Model(network, loss, opt, {'acc': Accuracy()}) - - if args.mode == 'train': - print("============== Starting Training ==============") - ds_train = create_dataset(args.preprocess_path, cfg.batch_size, cfg.num_epochs, True) - config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - ckpoint_cb = ModelCheckpoint(prefix="lstm", 
directory=args.ckpt_path, config=config_ck)
-        model.train(cfg.num_epochs, ds_train, callbacks=[ckpoint_cb, loss_cb])
-    elif args.mode == 'test':
-        print("============== Starting Testing ==============")
-        ds_eval = create_dataset(args.preprocess_path, cfg.batch_size, 1, False)
-        param_dict = load_checkpoint(args.ckpt_path)
-        load_param_into_net(network, param_dict)
-        acc = model.eval(ds_eval)
-        print("============== Accuracy:{} ==============".format(acc))
-    else:
-        raise RuntimeError('mode should be train or test, rather than {}'.format(args.mode))
diff --git a/chapter06/lstm/src/__init__.py b/chapter06/lstm/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..301ef9dcb71d51fb0b849da4c221c67947ab09df
--- /dev/null
+++ b/chapter06/lstm/src/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
diff --git a/chapter06/lstm/config.py b/chapter06/lstm/src/config.py
similarity index 95%
rename from chapter06/lstm/config.py
rename to chapter06/lstm/src/config.py
index 0ae2d048be6ab5d9176c70c6210539775ad6507a..688760111c5d93f8414c61bae8bab7d28d4c08e4 100644
--- a/chapter06/lstm/config.py
+++ b/chapter06/lstm/src/config.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """
-network config
+network config setting
 """
 from easydict import EasyDict as edict
 
@@ -22,7 +22,7 @@ lstm_cfg = edict({
     'num_classes': 2,
     'learning_rate': 0.1,
     'momentum': 0.9,
-    'num_epochs': 1,
+    'num_epochs': 20,
     'batch_size': 64,
     'embed_size': 300,
     'num_hiddens': 100,
diff --git a/chapter06/lstm/src/dataset.py b/chapter06/lstm/src/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d4276dfd08e5134ea937c3a7613c5123100c31
--- /dev/null
+++ b/chapter06/lstm/src/dataset.py
@@ -0,0 +1,92 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Data operations used in train.py and eval.py
+"""
+import os
+
+import numpy as np
+
+import mindspore.dataset as ds
+from mindspore.mindrecord import FileWriter
+from .imdb import ImdbParser
+
+
+def lstm_create_dataset(data_home, batch_size, repeat_num=1, training=True):
+    """Data operations."""
+    ds.config.set_seed(1)
+    data_dir = os.path.join(data_home, "aclImdb_train.mindrecord0")
+    if not training:
+        data_dir = os.path.join(data_home, "aclImdb_test.mindrecord0")
+
+    data_set = ds.MindDataset(data_dir, columns_list=["feature", "label"], num_parallel_workers=4)
+
+    # shuffle, batch and repeat the dataset
+    data_set = data_set.shuffle(buffer_size=data_set.get_dataset_size())
+    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
+    data_set = data_set.repeat(count=repeat_num)
+
+    return data_set
+
+
+def _convert_to_mindrecord(data_home, features, labels, weight_np=None, training=True):
+    """
+    convert imdb dataset to mindrecord dataset
+    """
+    if weight_np is not None:
+        np.savetxt(os.path.join(data_home, 'weight.txt'), weight_np)
+
+    # write mindrecord
+    schema_json = {"id": {"type": "int32"},
+                   "label": {"type": "int32"},
+                   "feature": {"type": "int32", "shape": [-1]}}
+
+    data_dir = os.path.join(data_home, "aclImdb_train.mindrecord")
+    if not training:
+        data_dir = os.path.join(data_home, "aclImdb_test.mindrecord")
+
+    def get_imdb_data(features, labels):
+        data_list = []
+        for i, (label, feature) in enumerate(zip(labels, features)):
+            data_json = {"id": i,
+                         "label": int(label),
+                         "feature": feature.reshape(-1)}
+            data_list.append(data_json)
+        return data_list
+
+    writer = FileWriter(data_dir, shard_num=4)
+    data = get_imdb_data(features, labels)
+    writer.add_schema(schema_json, "nlp_schema")
+    writer.add_index(["id", "label"])
+    writer.write_raw_data(data)
+    writer.commit()
+
+
+def convert_to_mindrecord(embed_size, aclimdb_path, preprocess_path, glove_path):
+    """
+    convert imdb dataset to mindrecord dataset
+    """
+    parser = ImdbParser(aclimdb_path, glove_path, embed_size)
+    parser.parse()
+
+    if not os.path.exists(preprocess_path):
+        print(f"preprocess path {preprocess_path} does not exist")
+        os.makedirs(preprocess_path)
+
+    train_features, train_labels, train_weight_np = parser.get_datas('train')
+    _convert_to_mindrecord(preprocess_path, train_features, train_labels, train_weight_np)
+
+    test_features, test_labels, _ = parser.get_datas('test')
+    _convert_to_mindrecord(preprocess_path, test_features, test_labels, training=False)
diff --git a/chapter06/lstm/src/imdb.py b/chapter06/lstm/src/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..9888b4c36fdd7625a8c28036a4375138f728a975
--- /dev/null
+++ b/chapter06/lstm/src/imdb.py
@@ -0,0 +1,155 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+imdb dataset parser.
+""" +import os +from itertools import chain + +import numpy as np +import gensim + + +class ImdbParser(): + """ + parse aclImdb data to features and labels. + sentence->tokenized->encoded->padding->features + """ + + def __init__(self, imdb_path, glove_path, embed_size=300): + self.__segs = ['train', 'test'] + self.__label_dic = {'pos': 1, 'neg': 0} + self.__imdb_path = imdb_path + self.__glove_dim = embed_size + self.__glove_file = os.path.join(glove_path, 'glove.6B.' + str(self.__glove_dim) + 'd.txt') + + # properties + self.__imdb_datas = {} + self.__features = {} + self.__labels = {} + self.__vacab = {} + self.__word2idx = {} + self.__weight_np = {} + self.__wvmodel = None + + def parse(self): + """ + parse imdb data to memory + """ + self.__wvmodel = gensim.models.KeyedVectors.load_word2vec_format(self.__glove_file) + + for seg in self.__segs: + self.__parse_imdb_datas(seg) + self.__parse_features_and_labels(seg) + self.__gen_weight_np(seg) + + def __parse_imdb_datas(self, seg): + """ + load data from txt + """ + data_lists = [] + for label_name, label_id in self.__label_dic.items(): + sentence_dir = os.path.join(self.__imdb_path, seg, label_name) + for file in os.listdir(sentence_dir): + with open(os.path.join(sentence_dir, file), mode='r', encoding='utf8') as f: + sentence = f.read().replace('\n', '') + data_lists.append([sentence, label_id]) + self.__imdb_datas[seg] = data_lists + + def __parse_features_and_labels(self, seg): + """ + parse features and labels + """ + features = [] + labels = [] + for sentence, label in self.__imdb_datas[seg]: + features.append(sentence) + labels.append(label) + + self.__features[seg] = features + self.__labels[seg] = labels + + # update feature to tokenized + self.__updata_features_to_tokenized(seg) + # parse vacab + self.__parse_vacab(seg) + # encode feature + self.__encode_features(seg) + # padding feature + self.__padding_features(seg) + + def __updata_features_to_tokenized(self, seg): + tokenized_features = [] + for sentence in self.__features[seg]: + tokenized_sentence = [word.lower() for word in sentence.split(" ")] + tokenized_features.append(tokenized_sentence) + self.__features[seg] = tokenized_features + + def __parse_vacab(self, seg): + # vocab + tokenized_features = self.__features[seg] + vocab = set(chain(*tokenized_features)) + self.__vacab[seg] = vocab + + # word_to_idx: {'hello': 1, 'world':111, ... 
'': 0} + word_to_idx = {word: i + 1 for i, word in enumerate(vocab)} + word_to_idx[''] = 0 + self.__word2idx[seg] = word_to_idx + + def __encode_features(self, seg): + """ encode word to index """ + word_to_idx = self.__word2idx['train'] + encoded_features = [] + for tokenized_sentence in self.__features[seg]: + encoded_sentence = [] + for word in tokenized_sentence: + encoded_sentence.append(word_to_idx.get(word, 0)) + encoded_features.append(encoded_sentence) + self.__features[seg] = encoded_features + + def __padding_features(self, seg, maxlen=500, pad=0): + """ pad all features to the same length """ + padded_features = [] + for feature in self.__features[seg]: + if len(feature) >= maxlen: + padded_feature = feature[:maxlen] + else: + padded_feature = feature + while len(padded_feature) < maxlen: + padded_feature.append(pad) + padded_features.append(padded_feature) + self.__features[seg] = padded_features + + def __gen_weight_np(self, seg): + """ + generate weight by gensim + """ + weight_np = np.zeros((len(self.__word2idx[seg]), self.__glove_dim), dtype=np.float32) + for word, idx in self.__word2idx[seg].items(): + if word not in self.__wvmodel: + continue + word_vector = self.__wvmodel.get_vector(word) + weight_np[idx, :] = word_vector + + self.__weight_np[seg] = weight_np + + def get_datas(self, seg): + """ + return features, labels, and weight + """ + features = np.array(self.__features[seg]).astype(np.int32) + labels = np.array(self.__labels[seg]).astype(np.int32) + weight = np.array(self.__weight_np[seg]) + return features, labels, weight diff --git a/chapter06/lstm/src/lstm.py b/chapter06/lstm/src/lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..c3ca0bbf7c9f4cc5d3fb09f24ce7b7bff10502ce --- /dev/null +++ b/chapter06/lstm/src/lstm.py @@ -0,0 +1,93 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""LSTM.""" + +import numpy as np + +from mindspore import Tensor, nn, context +from mindspore.ops import operations as P + +# Initialize short-term memory (h) and long-term memory (c) to 0 +def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional): + """init default input.""" + num_directions = 1 + if bidirectional: + num_directions = 2 + + if context.get_context("device_target") == "CPU": + h_list = [] + c_list = [] + i = 0 + while i < num_layers: + hi = Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)) + h_list.append(hi) + ci = Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)) + c_list.append(ci) + i = i + 1 + h = tuple(h_list) + c = tuple(c_list) + return h, c + + h = Tensor( + np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) + c = Tensor( + np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32)) + return h, c + + +class SentimentNet(nn.Cell): + """Sentiment network structure.""" + + def __init__(self, + vocab_size, + embed_size, + num_hiddens, + num_layers, + bidirectional, + num_classes, + weight, + batch_size): + super(SentimentNet, self).__init__() + # Mapp words to vectors + self.embedding = nn.Embedding(vocab_size, + embed_size, + embedding_table=weight) + self.embedding.embedding_table.requires_grad = False + self.trans = P.Transpose() + self.perm = (1, 0, 2) + self.encoder = nn.LSTM(input_size=embed_size, + hidden_size=num_hiddens, + num_layers=num_layers, + has_bias=True, + bidirectional=bidirectional, + dropout=0.0) + + self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional) + + self.concat = P.Concat(1) + if bidirectional: + self.decoder = nn.Dense(num_hiddens * 4, num_classes) + else: + self.decoder = nn.Dense(num_hiddens * 2, num_classes) + + def construct(self, inputs): + # input:(64,500,300) + embeddings = self.embedding(inputs) + embeddings = self.trans(embeddings, self.perm) + output, _ = self.encoder(embeddings, (self.h, self.c)) + # states[i] size(64,200) -> encoding.size(64,400) + encoding = self.concat((output[0], output[499])) + outputs = self.decoder(encoding) + return outputs diff --git a/chapter06/lstm/train.py b/chapter06/lstm/train.py new file mode 100644 index 0000000000000000000000000000000000000000..51ae12c6854fd01b8b4877bc9b1e3acb875861a3 --- /dev/null +++ b/chapter06/lstm/train.py @@ -0,0 +1,89 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +#################train lstm example on aclImdb######################## +python train.py --preprocess=true --aclimdb_path=your_imdb_path --glove_path=your_glove_path +""" +import argparse +import os + +import numpy as np + +from src.config import lstm_cfg as cfg +from src.dataset import convert_to_mindrecord +from src.dataset import lstm_create_dataset +from src.lstm import SentimentNet +from mindspore import Tensor, nn, Model, context +from mindspore.nn import Accuracy +from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor +from mindspore.train.serialization import load_param_into_net, load_checkpoint + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='MindSpore LSTM Example') + parser.add_argument('--preprocess', type=str, default='false', choices=['true', 'false'], + help='whether to preprocess data.') + parser.add_argument('--aclimdb_path', type=str, default="./aclImdb", + help='path where the dataset is stored.') + parser.add_argument('--glove_path', type=str, default="./glove", + help='path where the GloVe is stored.') + parser.add_argument('--preprocess_path', type=str, default="./preprocess", + help='path where the pre-process data is stored.') + parser.add_argument('--ckpt_path', type=str, default="./", + help='the path to save the checkpoint file.') + parser.add_argument('--pre_trained', type=str, default=None, + help='the pretrained checkpoint file path.') + parser.add_argument('--device_target', type=str, default="GPU", choices=['GPU', 'CPU'], + help='the target device to run, support "GPU", "CPU". Default: "GPU".') + args = parser.parse_args() + + context.set_context( + mode=context.GRAPH_MODE, + save_graphs=False, + device_target=args.device_target) + + if args.preprocess == "true": + print("============== Starting Data Pre-processing ==============") + convert_to_mindrecord(cfg.embed_size, args.aclimdb_path, args.preprocess_path, args.glove_path) + + embedding_table = np.loadtxt(os.path.join(args.preprocess_path, "weight.txt")).astype(np.float32) + network = SentimentNet(vocab_size=embedding_table.shape[0], + embed_size=cfg.embed_size, + num_hiddens=cfg.num_hiddens, + num_layers=cfg.num_layers, + bidirectional=cfg.bidirectional, + num_classes=cfg.num_classes, + weight=Tensor(embedding_table), + batch_size=cfg.batch_size) + # pre_trained + if args.pre_trained: + load_param_into_net(network, load_checkpoint(args.pre_trained)) + + loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) + opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum) + loss_cb = LossMonitor() + + model = Model(network, loss, opt, {'acc': Accuracy()}) + + print("============== Starting Training ==============") + ds_train = lstm_create_dataset(args.preprocess_path, cfg.batch_size, cfg.num_epochs) + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=args.ckpt_path, config=config_ck) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + if args.device_target == "CPU": + model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb], dataset_sink_mode=False) + else: + model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb]) + print("============== Training Success ==============")