From 1b8d2e794b32039aa7ecc6367dabb64a3e5e6467 Mon Sep 17 00:00:00 2001
From: blue-fish <67130644+blue-fish@users.noreply.github.com>
Date: Mon, 22 Jun 2020 09:44:36 -0700
Subject: [PATCH] Cleanup pr 331 (#366)

[#366] Add CPU support. Also some updates for tensorflow v2 compatibility (in work)

Co-authored-by: pusalieth
---
 demo_cli.py                        | 41 +++++++++--------
 encoder/inference.py               |  2 +-
 encoder/train.py                   |  6 +--
 requirements.txt                   |  6 ++-
 synthesizer/feeder.py              | 18 ++++----
 synthesizer/inference.py           |  5 +--
 synthesizer/models/attention.py    |  8 ++--
 synthesizer/models/helpers.py      |  2 +-
 synthesizer/models/modules.py      | 70 +++++++++++++++--------------
 synthesizer/models/tacotron.py     | 60 ++++++++++++------------
 synthesizer/tacotron2.py           | 20 ++++-----
 synthesizer/train.py               | 72 +++++++++++++++---------------
 vocoder/inference.py               | 12 +++--
 vocoder/models/fatchord_version.py | 26 ++++++++---
 14 files changed, 188 insertions(+), 160 deletions(-)

diff --git a/demo_cli.py b/demo_cli.py
index 57bb001..1fb2df4 100644
--- a/demo_cli.py
+++ b/demo_cli.py
@@ -5,6 +5,7 @@ from encoder import inference as encoder
 from vocoder import inference as vocoder
 from pathlib import Path
 import numpy as np
+import soundfile as sf
 import librosa
 import argparse
 import torch
@@ -30,6 +31,7 @@ if __name__ == '__main__':
         "overhead but allows to save some GPU memory for lower-end GPUs.")
     parser.add_argument("--no_sound", action="store_true", help=\
         "If True, audio won't be played.")
+    parser.add_argument("--cpu", help="Use CPU.", action="store_true")
     args = parser.parse_args()
     print_args(args, parser)
     if not args.no_sound:
@@ -38,22 +40,25 @@ if __name__ == '__main__':
     ## Print some environment information (for debugging purposes)
     print("Running a test of your configuration...\n")
 
-    if not torch.cuda.is_available():
-        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
+    if args.cpu:
+        print("Using CPU for inference.")
+    elif torch.cuda.is_available():
+        device_id = torch.cuda.current_device()
+        gpu_properties = torch.cuda.get_device_properties(device_id)
+        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+              "%.1fGb total memory.\n" %
+              (torch.cuda.device_count(),
+               device_id,
+               gpu_properties.name,
+               gpu_properties.major,
+               gpu_properties.minor,
+               gpu_properties.total_memory / 1e9))
+    else:
+        print("Your PyTorch installation is not configured. If you have a GPU ready "
               "for deep learning, ensure that the drivers are properly installed, and that your "
-              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
-              "not supported.", file=sys.stderr)
+              "CUDA version matches your PyTorch installation.", file=sys.stderr)
+        print("\nIf you're trying to use a cpu, please use the option --cpu.", file=sys.stderr)
         quit(-1)
-    device_id = torch.cuda.current_device()
-    gpu_properties = torch.cuda.get_device_properties(device_id)
-    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
-          "%.1fGb total memory.\n" %
-          (torch.cuda.device_count(),
-           device_id,
-           gpu_properties.name,
-           gpu_properties.major,
-           gpu_properties.minor,
-           gpu_properties.total_memory / 1e9))
 
 
     ## Load the models one by one.
@@ -172,15 +177,13 @@ if __name__ == '__main__':
                 sd.play(generated_wav, synthesizer.sample_rate)
             
             # Save it on the disk
-            fpath = "demo_output_%02d.wav" % num_generated
+            filename = "demo_output_%02d.wav" % num_generated
             print(generated_wav.dtype)
-            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
-                                     synthesizer.sample_rate)
+            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
             num_generated += 1
-            print("\nSaved output as %s\n\n" % fpath)
+            print("\nSaved output as %s\n\n" % filename)
             
         except Exception as e:
             print("Caught exception: %s" % repr(e))
             print("Restarting\n")
-    
\ No newline at end of file
diff --git a/encoder/inference.py b/encoder/inference.py
index 2447832..d769dd1 100644
--- a/encoder/inference.py
+++ b/encoder/inference.py
@@ -30,7 +30,7 @@ def load_model(weights_fpath: Path, device=None):
     elif isinstance(device, str):
         _device = torch.device(device)
     _model = SpeakerEncoder(_device, torch.device("cpu"))
-    checkpoint = torch.load(weights_fpath)
+    checkpoint = torch.load(weights_fpath, _device)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
diff --git a/encoder/train.py b/encoder/train.py
index 071af1b..ee65418 100644
--- a/encoder/train.py
+++ b/encoder/train.py
@@ -7,11 +7,12 @@ from pathlib import Path
 import torch
 
 def sync(device: torch.device):
-    # FIXME
-    return 
     # For correct profiling (cuda operations are async)
     if device.type == "cuda":
         torch.cuda.synchronize(device)
+    else:
+        torch.cpu.synchronize(device)
+
 
 def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
           backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
@@ -122,4 +123,3 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
             }, backup_fpath)
             
         profiler.tick("Extras (visualizations, saving)")
-        
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 4b54673..c0fdf41 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,8 @@
-tensorflow-gpu>=1.10.0,<=1.14.0
+# each portion of tensorflow is needed
+# core package is for RNN, cpu and gpu are for specific system speed-ups
+tensorflow==1.15
+tensorflow-cpu==1.15
+tensorflow-gpu==1.15
 umap-learn
 visdom
 webrtcvad
diff --git a/synthesizer/feeder.py b/synthesizer/feeder.py
index 6fc1b20..b1acb3d 100644
--- a/synthesizer/feeder.py
+++ b/synthesizer/feeder.py
@@ -70,22 +70,22 @@ class Feeder:
         # Create placeholders for inputs and targets. Don"t specify batch size because we want
         # to be able to feed different batch sizes at eval time.
         self._placeholders = [
-            tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
-            tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
-            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
+            tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"),
+            tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
+            tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                            name="mel_targets"),
-            tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
-            tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
-            tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
+            tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"),
+            tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
+            tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                            name="split_infos"),
             
             # SV2TTS
-            tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
+            tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                            name="speaker_embeddings")
         ]
         
         # Create queue for buffering data
-        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
+        queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
                                  tf.int32, tf.int32, tf.float32], name="input_queue")
         self._enqueue_op = queue.enqueue(self._placeholders)
         self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
@@ -100,7 +100,7 @@ class Feeder:
         self.speaker_embeddings.set_shape(self._placeholders[6].shape)
         
         # Create eval queue for buffering eval data
-        eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
+        eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
                                       tf.int32, tf.int32, tf.float32], name="eval_queue")
         self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
         self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
diff --git a/synthesizer/inference.py b/synthesizer/inference.py
index 99fb778..86bd2fb 100644
--- a/synthesizer/inference.py
+++ b/synthesizer/inference.py
@@ -54,7 +54,7 @@ class Synthesizer:
         """
         if self._low_mem:
             raise Exception("Cannot load the synthesizer permanently in low mem mode")
-        tf.reset_default_graph()
+        tf.compat.v1.reset_default_graph()
         self._model = Tacotron2(self.checkpoint_fpath, hparams)
     
     def synthesize_spectrograms(self, texts: List[str],
@@ -88,7 +88,7 @@ class Synthesizer:
     @staticmethod
     def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
         # Load the model and forward the inputs
-        tf.reset_default_graph()
+        tf.compat.v1.reset_default_graph()
         model = Tacotron2(checkpoint_fpath, hparams)
         specs, alignments = model.my_synthesize(embeddings, texts)
         
@@ -134,4 +134,3 @@ class Synthesizer:
         with the same parameters present in hparams.py.
""" return audio.inv_mel_spectrogram(mel, hparams) - \ No newline at end of file diff --git a/synthesizer/models/attention.py b/synthesizer/models/attention.py index 58892ad..1f40d45 100644 --- a/synthesizer/models/attention.py +++ b/synthesizer/models/attention.py @@ -60,10 +60,10 @@ def _location_sensitive_score(W_query, W_fil, W_keys): dtype = W_query.dtype num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] - v_a = tf.get_variable( + v_a = tf.compat.v1.get_variable( "attention_variable_projection", shape=[num_units], dtype=dtype, initializer=tf.contrib.layers.xavier_initializer()) - b_a = tf.get_variable( + b_a = tf.compat.v1.get_variable( "attention_bias", shape=[num_units], dtype=dtype, initializer=tf.zeros_initializer()) @@ -155,10 +155,10 @@ class LocationSensitiveAttention(BahdanauAttention): probability_fn=normalization_function, name=name) - self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, + self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters, kernel_size=hparams.attention_kernel, padding="same", use_bias=True, bias_initializer=tf.zeros_initializer(), name="location_features_convolution") - self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, + self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False, dtype=tf.float32, name="location_features_layer") self._cumulate = cumulate_weights diff --git a/synthesizer/models/helpers.py b/synthesizer/models/helpers.py index eec0699..4e58ccd 100644 --- a/synthesizer/models/helpers.py +++ b/synthesizer/models/helpers.py @@ -119,7 +119,7 @@ class TacoTrainingHelper(Helper): #Pick previous outputs randomly with respect to teacher forcing ratio next_inputs = tf.cond( - tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), + tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), lambda: self._targets[:, time, :], #Teacher-forcing: return true frame lambda: outputs[:,-self._output_dim:]) diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 7696572..f9fe7eb 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -1,4 +1,5 @@ import tensorflow as tf +import torch class HighwayNet: @@ -6,12 +7,12 @@ class HighwayNet: self.units = units self.scope = "HighwayNet" if name is None else name - self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") - self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", + self.H_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") + self.T_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", bias_initializer=tf.constant_initializer(-1.)) def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): H = self.H_layer(inputs) T = self.T_layer(inputs) return H * T + inputs * (1. 
@@ -38,8 +39,8 @@ class CBHG:
             self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope))
 
     def __call__(self, inputs, input_lengths):
-        with tf.variable_scope(self.scope):
-            with tf.variable_scope("conv_bank"):
+        with tf.compat.v1.variable_scope(self.scope):
+            with tf.compat.v1.variable_scope("conv_bank"):
                 # Convolution bank: concatenate on the last axis to stack channels from all
                 # convolutions
                 # The convolution bank uses multiple different kernel sizes to have many insights
@@ -71,7 +72,7 @@ class CBHG:
             # Additional projection in case of dimension mismatch (for HighwayNet "residual"
             # connection)
             if highway_input.shape[2] != self.highway_units:
-                highway_input = tf.layers.dense(highway_input, self.highway_units)
+                highway_input = tf.compat.v1.layers.Dense(highway_input, self.highway_units)
 
             # 4-layer HighwayNet
             for highwaynet in self.highwaynet_layers:
@@ -88,7 +89,7 @@ class CBHG:
             return tf.concat(outputs, axis=2)  # Concat forward and backward outputs
 
 
-class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
+class ZoneoutLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell):
     """Wrapper for tf LSTM to create Zoneout LSTM Cell
 
     inspired by:
@@ -108,8 +109,11 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
 
         if zm < 0. or zs > 1.:
             raise ValueError("One/both provided Zoneout factors are not in [0, 1]")
-
-        self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
+
+        if torch.cuda.is_available():
+            self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(num_units, name=name)
+        else:
+            self._cell = tf.contrib.rnn.LSTMBlockCell(num_units, name=name)
         self._zoneout_cell = zoneout_factor_cell
         self._zoneout_outputs = zoneout_factor_output
         self.is_training = is_training
@@ -144,16 +148,13 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
         if self.is_training:
             # nn.dropout takes keep_prob (probability to keep activations) not drop_prob (
             # probability to mask activations)!
-            c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
-                                                         (1 - self._zoneout_cell)) + prev_c
-            h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
-                                                            (1 - self._zoneout_outputs)) + prev_h
-
+            c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c
+            h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h
         else:
             c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
             h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
 
-        new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
+        new_state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
                                                                                                   h])
 
         return output, new_state
@@ -175,7 +176,7 @@ class EncoderConvolutions:
         """
         super(EncoderConvolutions, self).__init__()
         self.is_training = is_training
-        
+
         self.kernel_size = hparams.enc_conv_kernel_size
         self.channels = hparams.enc_conv_channels
         self.activation = activation
@@ -184,7 +185,7 @@ class EncoderConvolutions:
         self.enc_conv_num_layers = hparams.enc_conv_num_layers
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             x = inputs
             for i in range(self.enc_conv_num_layers):
                 x = conv1d(x, self.kernel_size, self.channels, self.activation,
@@ -226,8 +227,8 @@ class EncoderRNN:
                                        name="encoder_bw_LSTM")
 
     def __call__(self, inputs, input_lengths):
-        with tf.variable_scope(self.scope):
-            outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
+        with tf.compat.v1.variable_scope(self.scope):
+            outputs, (fw_state, bw_state) = tf.compat.v1.nn.bidirectional_dynamic_rnn(
                 self._fw_cell,
                 self._bw_cell,
                 inputs,
@@ -239,7 +240,8 @@ class EncoderRNN:
 
 
 class Prenet:
-    """Two fully connected layers used as an information bottleneck for the attention.
+    """
+    Two fully connected layers used as an information bottleneck for the attention.
     """
 
     def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu,
@@ -263,13 +265,13 @@ class Prenet:
     def __call__(self, inputs):
         x = inputs
 
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             for i, size in enumerate(self.layers_sizes):
-                dense = tf.layers.dense(x, units=size, activation=self.activation,
+                dense = tf.compat.v1.layers.dense(x, units=size, activation=self.activation,
                                         name="dense_{}".format(i + 1))
 
                 # The paper discussed introducing diversity in generation at inference time
                 # by using a dropout of 0.5 only in prenet layers (in both training and inference).
-                x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
+                x = tf.compat.v1.layers.dropout(dense, rate=self.drop_rate, training=True,
                                       name="dropout_{}".format(i + 1) + self.scope)
         return x
 
@@ -302,10 +304,10 @@ class DecoderRNN:
                                            name="decoder_LSTM_{}".format(i + 1)) for i in
                            range(layers)]
 
-        self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
+        self._cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
 
     def __call__(self, inputs, states):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             return self._cell(inputs, states)
 
 
@@ -327,14 +329,14 @@ class FrameProjection:
         self.activation = activation
 
         self.scope = "Linear_projection" if scope is None else scope
-        self.dense = tf.layers.Dense(units=shape, activation=activation,
+        self.dense = tf.compat.v1.layers.Dense(units=shape, activation=activation,
                                      name="projection_{}".format(self.scope))
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             # If activation==None, this returns a simple Linear projection
             # else the projection will be passed through an activation function
-            # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
+            # output = tf.compat.v1.layers.Dense(inputs, units=self.shape, activation=self.activation,
             # name="projection_{}".format(self.scope))
             output = self.dense(inputs)
 
@@ -362,7 +364,7 @@ class StopProjection:
         self.scope = "stop_token_projection" if scope is None else scope
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             output = tf.layers.dense(inputs, units=self.shape,
                                      activation=None, name="projection_{}".format(self.scope))
 
@@ -399,7 +401,7 @@ class Postnet:
         self.drop_rate = hparams.tacotron_dropout_rate
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             x = inputs
             for i in range(self.postnet_num_layers - 1):
                 x = conv1d(x, self.kernel_size, self.channels, self.activation,
@@ -412,16 +414,16 @@ class Postnet:
 
 
 def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
-    with tf.variable_scope(scope):
-        conv1d_output = tf.layers.conv1d(
+    with tf.compat.v1.variable_scope(scope):
+        conv1d_output = tf.compat.v1.layers.conv1d(
             inputs,
             filters=channels,
             kernel_size=kernel_size,
             activation=None,
             padding="same")
-        batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
+        batched = tf.compat.v1.layers.batch_normalization(conv1d_output, training=is_training)
         activated = activation(batched)
-        return tf.layers.dropout(activated, rate=drop_rate, training=is_training,
+        return tf.compat.v1.layers.dropout(activated, rate=drop_rate, training=is_training,
                                  name="dropout_{}".format(scope))
diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py
index 9c4de4d..8cf2d3d 100644
--- a/synthesizer/models/tacotron.py
+++ b/synthesizer/models/tacotron.py
@@ -83,11 +83,11 @@ class Tacotron():
             ##############
-            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
-            p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
-                lout_float) if mel_targets is not None else mel_targets
-            p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
-                lout_float) if stop_token_targets is not None else \
+            p_inputs = tf.numpy_function(split_func, [inputs, split_infos[:, 0]], lout_int)
+            p_mel_targets = tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]],
+                lout_float) if mel_targets is not None else mel_targets
+            p_stop_token_targets = tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]],
+                lout_float) if stop_token_targets is not None else \
                 stop_token_targets
 
             tower_inputs = []
@@ -120,9 +120,9 @@ class Tacotron():
         gpus = ["/gpu:{}".format(i) for i in
                 range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
         for i in range(hp.tacotron_num_gpus):
-            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
-                                                          worker_device=gpus[i])):
-                with tf.variable_scope("inference") as scope:
+            with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
+                                                                    worker_device=gpus[i])):
+                with tf.compat.v1.variable_scope("inference") as scope:
                     assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
                     if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
                         assert global_step is not None
@@ -132,7 +132,7 @@ class Tacotron():
                     post_condition = hp.predict_linear and not gta
 
                     # Embeddings ==> [batch_size, sequence_length, embedding_dim]
-                    self.embedding_table = tf.get_variable(
+                    self.embedding_table = tf.compat.v1.get_variable(
                         "inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
                     embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
 
@@ -283,7 +283,7 @@ class Tacotron():
         self.tower_targets_lengths = tower_targets_lengths
         self.tower_stop_token_targets = tower_stop_token_targets
 
-        self.all_vars = tf.trainable_variables()
+        self.all_vars = tf.compat.v1.trainable_variables()
 
         log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
         log("  Train mode: {}".format(is_training))
@@ -331,9 +331,9 @@ class Tacotron():
                 range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
         for i in range(hp.tacotron_num_gpus):
-            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
-                                                          worker_device=gpus[i])):
-                with tf.variable_scope("loss") as scope:
+            with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
+                                                                    worker_device=gpus[i])):
+                with tf.compat.v1.variable_scope("loss") as scope:
                     if hp.mask_decoder:
                         # Compute loss of predictions before postnet
                         before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i],
@@ -356,11 +356,11 @@ class Tacotron():
                         linear_loss = 0.
                     else:
                         # Compute loss of predictions before postnet
-                        before = tf.losses.mean_squared_error(self.tower_mel_targets[i],
-                                                              self.tower_decoder_output[i])
+                        before = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
+                                                                        self.tower_decoder_output[i])
                         # Compute loss after postnet
-                        after = tf.losses.mean_squared_error(self.tower_mel_targets[i],
-                                                             self.tower_mel_outputs[i])
+                        after = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
+                                                                       self.tower_mel_outputs[i])
                         # Compute loss (for learning dynamic generation stop)
                         stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                             labels=self.tower_stop_token_targets[i],
@@ -439,7 +439,7 @@ class Tacotron():
         grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0]
         with tf.device(grad_device):
-            with tf.variable_scope("optimizer") as scope:
+            with tf.compat.v1.variable_scope("optimizer") as scope:
                 if hp.tacotron_decay_learning_rate:
                     self.decay_steps = hp.tacotron_decay_steps
                     self.decay_rate = hp.tacotron_decay_rate
@@ -448,16 +448,16 @@ class Tacotron():
                 else:
                     self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)
 
-                optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
-                                                   hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
+                optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
+                                                             hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
 
             # 2. Compute Gradient
             for i in range(hp.tacotron_num_gpus):
                 # Device placement
-                with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
-                                                              worker_device=gpus[i])):
+                with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
+                                                                        worker_device=gpus[i])):
                     # agg_loss += self.tower_loss[i]
-                    with tf.variable_scope("optimizer") as scope:
+                    with tf.compat.v1.variable_scope("optimizer") as scope:
                         gradients = optimizer.compute_gradients(self.tower_loss[i])
                         tower_gradients.append(gradients)
@@ -490,7 +490,7 @@ class Tacotron():
             # Add dependency on UPDATE_OPS; otherwise batchnorm won"t work correctly. See:
             # https://github.com/tensorflow/tensorflow/issues/1122
-            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
+            with tf.control_dependencies(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)):
                 self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars),
                                                           global_step=global_step)
 
@@ -510,12 +510,12 @@ class Tacotron():
         hp = self._hparams
 
         # Compute natural exponential decay
-        lr = tf.train.exponential_decay(init_lr,
-                                        global_step - hp.tacotron_start_decay,
-                                        # lr = 1e-3 at step 50k
-                                        self.decay_steps,
-                                        self.decay_rate,  # lr = 1e-5 around step 310k
-                                        name="lr_exponential_decay")
+        lr = tf.compat.v1.train.exponential_decay(init_lr,
+                                                  global_step - hp.tacotron_start_decay,
+                                                  # lr = 1e-3 at step 50k
+                                                  self.decay_steps,
+                                                  self.decay_rate,  # lr = 1e-5 around step 310k
+                                                  name="lr_exponential_decay")
 
         # clip learning rate by max and min values (initial and final values)
         return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)
diff --git a/synthesizer/tacotron2.py b/synthesizer/tacotron2.py
index 4a5b199..e4c6850 100644
--- a/synthesizer/tacotron2.py
+++ b/synthesizer/tacotron2.py
@@ -12,13 +12,13 @@ class Tacotron2:
     def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
         log("Constructing model: %s" % model_name)
         #Force the batch size to be known in order to use attention masking in batch synthesis
-        inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
-        input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths")
-        speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
+        inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs")
+        input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths")
+        speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
                                             name="speaker_embeddings")
-        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
-        split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
-        with tf.variable_scope("Tacotron_model") as scope:
+        targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
+        split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
+        with tf.compat.v1.variable_scope("Tacotron_model") as scope:
             self.model = create_model(model_name, hparams)
             if gta:
                 self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
@@ -52,14 +52,14 @@ class Tacotron2:
         log("Loading checkpoint: %s" % checkpoint_path)
         
         #Memory allocation on the GPUs as needed
-        config = tf.ConfigProto()
+        config = tf.compat.v1.ConfigProto()
         config.gpu_options.allow_growth = True
         config.allow_soft_placement = True
 
-        self.session = tf.Session(config=config)
-        self.session.run(tf.global_variables_initializer())
+        self.session = tf.compat.v1.Session(config=config)
+        self.session.run(tf.compat.v1.global_variables_initializer())
 
-        saver = tf.train.Saver()
+        saver = tf.compat.v1.train.Saver()
         saver.restore(self.session, checkpoint_path)
 
     def my_synthesize(self, speaker_embeds, texts):
diff --git a/synthesizer/train.py b/synthesizer/train.py
index 4fe6bbd..416c4a9 100644
--- a/synthesizer/train.py
+++ b/synthesizer/train.py
@@ -33,48 +33,48 @@ def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoi
 
 
 def add_train_stats(model, hparams):
-    with tf.variable_scope("stats") as scope:
+    with tf.compat.v1.variable_scope("stats") as scope:
tf.compat.v1.variable_scope("stats") as scope: for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) - tf.summary.scalar("before_loss", model.before_loss) - tf.summary.scalar("after_loss", model.after_loss) + tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) + tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) + tf.compat.v1.summary.scalar("before_loss", model.before_loss) + tf.compat.v1.summary.scalar("after_loss", model.after_loss) if hparams.predict_linear: - tf.summary.scalar("linear_loss", model.linear_loss) + tf.compat.v1.summary.scalar("linear_loss", model.linear_loss) for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) + tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) + tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) - tf.summary.scalar("regularization_loss", model.regularization_loss) - tf.summary.scalar("stop_token_loss", model.stop_token_loss) - tf.summary.scalar("loss", model.loss) - tf.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed + tf.compat.v1.summary.scalar("regularization_loss", model.regularization_loss) + tf.compat.v1.summary.scalar("stop_token_loss", model.stop_token_loss) + tf.compat.v1.summary.scalar("loss", model.loss) + tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed if hparams.tacotron_teacher_forcing_mode == "scheduled": - tf.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing + tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing # ratio decay when mode = "scheduled" - gradient_norms = [tf.norm(grad) for grad in model.gradients] - tf.summary.histogram("gradient_norm", gradient_norms) - tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize + gradient_norms = [tf.norm(tensor=grad) for grad in model.gradients] + tf.compat.v1.summary.histogram("gradient_norm", gradient_norms) + tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(input_tensor=gradient_norms)) # visualize # gradients (in case of explosion) - return tf.summary.merge_all() + return tf.compat.v1.summary.merge_all() def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): values = [ - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", - simple_value=before_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", - simple_value=after_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", - simple_value=stop_token_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", + simple_value=before_loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", + simple_value=after_loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", + simple_value=stop_token_loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), ] if linear_loss is not None: - 
-        values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
-                                       simple_value=linear_loss))
-    test_summary = tf.Summary(value=values)
+        values.append(tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
+                                                 simple_value=linear_loss))
+    test_summary = tf.compat.v1.Summary(value=values)
     summary_writer.add_summary(test_summary, step)
 
 
@@ -83,7 +83,7 @@ def time_string():
 
 
 def model_train_mode(args, feeder, hparams, global_step):
-    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
+    with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
         model = create_model("Tacotron", hparams)
         model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings,
                          feeder.mel_targets, feeder.token_targets,
@@ -96,7 +96,7 @@ def model_train_mode(args, feeder, hparams, global_step):
 
 
 def model_test_mode(args, feeder, hparams, global_step):
-    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
+    with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
         model = create_model("Tacotron", hparams)
         model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
                          feeder.eval_speaker_embeddings, feeder.eval_mel_targets,
@@ -136,11 +136,11 @@ def train(log_dir, args, hparams):
     log(hparams_debug_string())
 
     # Start by setting a seed for repeatability
-    tf.set_random_seed(hparams.tacotron_random_seed)
+    tf.compat.v1.set_random_seed(hparams.tacotron_random_seed)
 
     # Set up data feeder
     coord = tf.train.Coordinator()
-    with tf.variable_scope("datafeeder") as scope:
+    with tf.compat.v1.variable_scope("datafeeder") as scope:
         feeder = Feeder(coord, metadat_fpath, hparams)
 
     # Set up model:
@@ -164,21 +164,21 @@ def train(log_dir, args, hparams):
     step = 0
     time_window = ValueWindow(100)
     loss_window = ValueWindow(100)
-    saver = tf.train.Saver(max_to_keep=5)
+    saver = tf.compat.v1.train.Saver(max_to_keep=5)
 
     log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))
 
     # Memory allocation on the GPU as needed
-    config = tf.ConfigProto()
+    config = tf.compat.v1.ConfigProto()
     config.gpu_options.allow_growth = True
     config.allow_soft_placement = True
 
     # Train
-    with tf.Session(config=config) as sess:
+    with tf.compat.v1.Session(config=config) as sess:
         try:
-            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
+            summary_writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph)
 
-            sess.run(tf.global_variables_initializer())
+            sess.run(tf.compat.v1.global_variables_initializer())
 
             # saved model restoring
             if args.restore:
diff --git a/vocoder/inference.py b/vocoder/inference.py
index 19e639c..7e54684 100644
--- a/vocoder/inference.py
+++ b/vocoder/inference.py
@@ -6,7 +6,7 @@ import torch
 _model = None   # type: WaveRNN
 
 def load_model(weights_fpath, verbose=True):
-    global _model
+    global _model, _device
     
     if verbose:
         print("Building Wave-RNN")
@@ -23,11 +23,17 @@ def load_model(weights_fpath, verbose=True):
         hop_length=hp.hop_length,
         sample_rate=hp.sample_rate,
         mode=hp.voc_mode
-    ).cuda()
+    )
+
+    if torch.cuda.is_available():
+        _model = _model.cuda()
+        _device = torch.device('cuda')
+    else:
+        _device = torch.device('cpu')
     
     if verbose:
         print("Loading model weights at %s" % weights_fpath)
-    checkpoint = torch.load(weights_fpath)
+    checkpoint = torch.load(weights_fpath, _device)
     _model.load_state_dict(checkpoint['model_state'])
     _model.eval()
 
diff --git a/vocoder/models/fatchord_version.py b/vocoder/models/fatchord_version.py
index 798842d..429572b 100644
--- a/vocoder/models/fatchord_version.py
+++ b/vocoder/models/fatchord_version.py
@@ -157,7 +157,10 @@ class WaveRNN(nn.Module):
         rnn2 = self.get_gru_cell(self.rnn2)
 
         with torch.no_grad():
-            mels = mels.cuda()
+            if torch.cuda.is_available():
+                mels = mels.cuda()
+            else:
+                mels = mels.cpu()
             wave_len = (mels.size(-1) - 1) * self.hop_length
             mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
             mels, aux = self.upsample(mels.transpose(1, 2))
@@ -168,9 +171,14 @@ class WaveRNN(nn.Module):
             b_size, seq_len, _ = mels.size()
 
-            h1 = torch.zeros(b_size, self.rnn_dims).cuda()
-            h2 = torch.zeros(b_size, self.rnn_dims).cuda()
-            x = torch.zeros(b_size, 1).cuda()
+            if torch.cuda.is_available():
+                h1 = torch.zeros(b_size, self.rnn_dims).cuda()
+                h2 = torch.zeros(b_size, self.rnn_dims).cuda()
+                x = torch.zeros(b_size, 1).cuda()
+            else:
+                h1 = torch.zeros(b_size, self.rnn_dims).cpu()
+                h2 = torch.zeros(b_size, self.rnn_dims).cpu()
+                x = torch.zeros(b_size, 1).cpu()
 
             d = self.aux_dims
             aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
@@ -260,7 +268,10 @@ class WaveRNN(nn.Module):
         # i.e., it won't generalise to other shapes/dims
         b, t, c = x.size()
         total = t + 2 * pad if side == 'both' else t + pad
-        padded = torch.zeros(b, total, c).cuda()
+        if torch.cuda.is_available():
+            padded = torch.zeros(b, total, c).cuda()
+        else:
+            padded = torch.zeros(b, total, c).cpu()
         if side == 'before' or side == 'both':
             padded[:, pad:pad + t, :] = x
         elif side == 'after':
@@ -306,7 +317,10 @@ class WaveRNN(nn.Module):
             padding = target + 2 * overlap - remaining
             x = self.pad_tensor(x, padding, side='after')
 
-        folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
+        if torch.cuda.is_available():
+            folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
+        else:
+            folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()
 
         # Get the values for the folded tensor
         for i in range(num_folds):
-- 
GitLab
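
Editor's sketch (not part of the patch): the PyTorch-side changes above all follow one device-selection idiom, so a condensed example is given below under stated assumptions. The helper name load_on_best_device and the generic nn.Module argument are illustrative inventions; the "model_state" checkpoint key matches what the diff itself loads in encoder/inference.py and vocoder/inference.py.

    import torch
    from torch import nn

    def load_on_best_device(model: nn.Module, weights_fpath: str) -> nn.Module:
        # Prefer CUDA when present, otherwise fall back to CPU -- the same
        # torch.cuda.is_available() check the patch adds in vocoder/inference.py
        # and vocoder/models/fatchord_version.py.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        # Passing the device to torch.load (map_location) lets GPU-trained
        # checkpoints load on CPU-only machines, which is what the added
        # torch.load(weights_fpath, _device) calls achieve.
        checkpoint = torch.load(weights_fpath, map_location=device)
        model.load_state_dict(checkpoint["model_state"])
        model.eval()
        return model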