......@@ -5,6 +5,7 @@ from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
......@@ -30,6 +31,7 @@ if __name__ == '__main__':
"overhead but allows to save some GPU memory for lower-end GPUs.")
parser.add_argument("--no_sound", action="store_true", help=\
"If True, audio won't be played.")
parser.add_argument("--cpu", help="Use CPU.", action="store_true")
args = parser.parse_args()
print_args(args, parser)
if not args.no_sound:
......@@ -38,22 +40,25 @@ if __name__ == '__main__':
## Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")
if not torch.cuda.is_available():
print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
if args.cpu:
print("Using CPU for inference.")
elif torch.cuda.is_available():
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
"%.1fGb total memory.\n" %
gpu_properties.total_memory / 1e9))
print("Your PyTorch installation is not configured. If you have a GPU ready "
"for deep learning, ensure that the drivers are properly installed, and that your "
"CUDA version matches your PyTorch installation. CPU-only inference is currently "
"not supported.", file=sys.stderr)
"CUDA version matches your PyTorch installation.", file=sys.stderr)
print("\nIf you're trying to use a cpu, please use the option --cpu.", file=sys.stderr)
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
"%.1fGb total memory.\n" %
gpu_properties.total_memory / 1e9))
## Load the models one by one.
......@@ -172,15 +177,13 @@ if __name__ == '__main__':
sd.play(generated_wav, synthesizer.sample_rate)
# Save it on the disk
fpath = "demo_output_%02d.wav" % num_generated
filename = "demo_output_%02d.wav" % num_generated
librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
num_generated += 1
print("\nSaved output as %s\n\n" % fpath)
print("\nSaved output as %s\n\n" % filename)
except Exception as e:
print("Caught exception: %s" % repr(e))
\ No newline at end of file
......@@ -30,7 +30,7 @@ def load_model(weights_fpath: Path, device=None):
elif isinstance(device, str):
_device = torch.device(device)
_model = SpeakerEncoder(_device, torch.device("cpu"))
checkpoint = torch.load(weights_fpath)
checkpoint = torch.load(weights_fpath, _device)
print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
......@@ -7,11 +7,12 @@ from pathlib import Path
import torch
def sync(device: torch.device):
# For correct profiling (cuda operations are async)
if device.type == "cuda":
def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
......@@ -122,4 +123,3 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
}, backup_fpath)
profiler.tick("Extras (visualizations, saving)")
\ No newline at end of file
# each portion of tensorflow is needed
# core package is for RNN, cpu and gpu are for specific system speed-ups
......@@ -70,22 +70,22 @@ class Feeder:
# Create placeholders for inputs and targets. Don't specify batch size because we want
# to be able to feed different batch sizes at eval time.
self._placeholders = [
tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"),
tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"),
tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
# Create queue for buffering data
queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="input_queue")
self._enqueue_op = queue.enqueue(self._placeholders)
self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
......@@ -100,7 +100,7 @@ class Feeder:
# Create eval queue for buffering eval data
eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="eval_queue")
self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
......@@ -54,7 +54,7 @@ class Synthesizer:
if self._low_mem:
raise Exception("Cannot load the synthesizer permanently in low mem mode")
self._model = Tacotron2(self.checkpoint_fpath, hparams)
def synthesize_spectrograms(self, texts: List[str],
......@@ -88,7 +88,7 @@ class Synthesizer:
def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
# Load the model and forward the inputs
model = Tacotron2(checkpoint_fpath, hparams)
specs, alignments = model.my_synthesize(embeddings, texts)
......@@ -134,4 +134,3 @@ class Synthesizer:
with the same parameters present in hparams.py.
return audio.inv_mel_spectrogram(mel, hparams)
\ No newline at end of file
......@@ -60,10 +60,10 @@ def _location_sensitive_score(W_query, W_fil, W_keys):
dtype = W_query.dtype
num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]
v_a = tf.get_variable(
v_a = tf.compat.v1.get_variable(
"attention_variable_projection", shape=[num_units], dtype=dtype,
b_a = tf.get_variable(
b_a = tf.compat.v1.get_variable(
"attention_bias", shape=[num_units], dtype=dtype,
......@@ -155,10 +155,10 @@ class LocationSensitiveAttention(BahdanauAttention):
self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters,
self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters,
kernel_size=hparams.attention_kernel, padding="same", use_bias=True,
bias_initializer=tf.zeros_initializer(), name="location_features_convolution")
self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,
self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False,
dtype=tf.float32, name="location_features_layer")
self._cumulate = cumulate_weights
......@@ -119,7 +119,7 @@ class TacoTrainingHelper(Helper):
#Pick previous outputs randomly with respect to teacher forcing ratio
next_inputs = tf.cond(
tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
lambda: self._targets[:, time, :], #Teacher-forcing: return true frame
lambda: outputs[:,-self._output_dim:])
import tensorflow as tf
import torch
class HighwayNet:
......@@ -6,12 +7,12 @@ class HighwayNet:
self.units = units
self.scope = "HighwayNet" if name is None else name
self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T",
self.H_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
self.T_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T",
def __call__(self, inputs):
# Apply the highway transform to `inputs` inside this layer's variable scope.
# H is the output of self.H_layer and T the output of self.T_layer; the result is
# H * T + inputs * (1 - T), i.e. T weights the transformed signal against the raw input.
# NOTE(review): the two `with` lines below look like the before/after pair of the
# tf.variable_scope -> tf.compat.v1.variable_scope migration; confirm only one is live.
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
H = self.H_layer(inputs)
T = self.T_layer(inputs)
return H * T + inputs * (1. - T)
......@@ -38,8 +39,8 @@ class CBHG:
self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope))
def __call__(self, inputs, input_lengths):
with tf.variable_scope(self.scope):
with tf.variable_scope("conv_bank"):
with tf.compat.v1.variable_scope(self.scope):
with tf.compat.v1.variable_scope("conv_bank"):
# Convolution bank: concatenate on the last axis to stack channels from all
# convolutions
# The convolution bank uses multiple different kernel sizes to have many insights
......@@ -71,7 +72,7 @@ class CBHG:
# Additional projection in case of dimension mismatch (for HighwayNet "residual"
# connection)
if highway_input.shape[2] != self.highway_units:
highway_input = tf.layers.dense(highway_input, self.highway_units)
highway_input = tf.compat.v1.layers.Dense(highway_input, self.highway_units)
# 4-layer HighwayNet
for highwaynet in self.highwaynet_layers:
......@@ -88,7 +89,7 @@ class CBHG:
return tf.concat(outputs, axis=2) # Concat forward and backward outputs
class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
class ZoneoutLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell):
"""Wrapper for tf LSTM to create Zoneout LSTM Cell
inspired by:
......@@ -108,8 +109,11 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
if zm < 0. or zs > 1.:
raise ValueError("One/both provided Zoneout factors are not in [0, 1]")
self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
if torch.cuda.is_available():
self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(num_units, name=name)
self._cell = tf.contrib.rnn.LSTMBlockCell(num_units, name=name)
self._zoneout_cell = zoneout_factor_cell
self._zoneout_outputs = zoneout_factor_output
self.is_training = is_training
......@@ -144,16 +148,13 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
if self.is_training:
# nn.dropout takes keep_prob (probability to keep activations) not drop_prob (
# probability to mask activations)!
c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
(1 - self._zoneout_cell)) + prev_c
h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
(1 - self._zoneout_outputs)) + prev_h
c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c
h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h
c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
new_state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
return output, new_state
......@@ -175,7 +176,7 @@ class EncoderConvolutions:
super(EncoderConvolutions, self).__init__()
self.is_training = is_training
self.kernel_size = hparams.enc_conv_kernel_size
self.channels = hparams.enc_conv_channels
self.activation = activation
......@@ -184,7 +185,7 @@ class EncoderConvolutions:
self.enc_conv_num_layers = hparams.enc_conv_num_layers
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
x = inputs
for i in range(self.enc_conv_num_layers):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
......@@ -226,8 +227,8 @@ class EncoderRNN:
def __call__(self, inputs, input_lengths):
with tf.variable_scope(self.scope):
outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
with tf.compat.v1.variable_scope(self.scope):
outputs, (fw_state, bw_state) = tf.compat.v1.nn.bidirectional_dynamic_rnn(
......@@ -239,7 +240,8 @@ class EncoderRNN:
class Prenet:
"""Two fully connected layers used as an information bottleneck for the attention.
Two fully connected layers used as an information bottleneck for the attention.
def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu,
......@@ -263,13 +265,13 @@ class Prenet:
def __call__(self, inputs):
# Feed `inputs` through one dense layer followed by one dropout layer for every entry
# of self.layers_sizes, and return the output of the last dropout.
# Layers are named "dense_<n>" and "dropout_<n>" + self.scope, with n starting at 1.
# NOTE(review): the paired `with` / `dense =` / `x =` lines look like the before/after
# versions of the tf.layers -> tf.compat.v1.layers migration; confirm which one is live.
x = inputs
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
for i, size in enumerate(self.layers_sizes):
dense = tf.layers.dense(x, units=size, activation=self.activation,
dense = tf.compat.v1.layers.dense(x, units=size, activation=self.activation,
name="dense_{}".format(i + 1))
# The paper discussed introducing diversity in generation at inference time
# by using a dropout of 0.5 only in prenet layers (in both training and inference).
# Hence training=True is hard-coded below: dropout is applied regardless of mode.
x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
x = tf.compat.v1.layers.dropout(dense, rate=self.drop_rate, training=True,
name="dropout_{}".format(i + 1) + self.scope)
return x
......@@ -302,10 +304,10 @@ class DecoderRNN:
name="decoder_LSTM_{}".format(i + 1)) for i in
self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
self._cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
def __call__(self, inputs, states):
# Call self._cell on (inputs, states) inside this layer's variable scope and return
# whatever the cell returns, unchanged.
# NOTE(review): the two `with` lines below look like the before/after pair of the
# tf.variable_scope -> tf.compat.v1.variable_scope migration; confirm only one is live.
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
return self._cell(inputs, states)
......@@ -327,14 +329,14 @@ class FrameProjection:
self.activation = activation
self.scope = "Linear_projection" if scope is None else scope
self.dense = tf.layers.Dense(units=shape, activation=activation,
self.dense = tf.compat.v1.layers.Dense(units=shape, activation=activation,
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
# If activation==None, this returns a simple Linear projection
# else the projection will be passed through an activation function
# output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
# output = tf.compat.v1.layers.Dense(inputs, units=self.shape, activation=self.activation,
# name="projection_{}".format(self.scope))
output = self.dense(inputs)
......@@ -362,7 +364,7 @@ class StopProjection:
self.scope = "stop_token_projection" if scope is None else scope
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
output = tf.layers.dense(inputs, units=self.shape,
activation=None, name="projection_{}".format(self.scope))
......@@ -399,7 +401,7 @@ class Postnet:
self.drop_rate = hparams.tacotron_dropout_rate
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
x = inputs
for i in range(self.postnet_num_layers - 1):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
......@@ -412,16 +414,16 @@ class Postnet:
def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
with tf.variable_scope(scope):
conv1d_output = tf.layers.conv1d(
with tf.compat.v1.variable_scope(scope):
conv1d_output = tf.compat.v1.layers.conv1d(
batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
batched = tf.compat.v1.layers.batch_normalization(conv1d_output, training=is_training)
activated = activation(batched)
return tf.layers.dropout(activated, rate=drop_rate, training=is_training,
return tf.compat.v1.layers.dropout(activated, rate=drop_rate, training=is_training,
......@@ -83,11 +83,11 @@ class Tacotron():
p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
lout_float) if mel_targets is not None else mel_targets
p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
lout_float) if stop_token_targets is not None else \
p_inputs = tf.numpy_function(split_func, [inputs, split_infos[:, 0]], lout_int)
p_mel_targets = tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]],
lout_float) if mel_targets is not None else mel_targets
p_stop_token_targets = tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]],
lout_float) if stop_token_targets is not None else \
tower_inputs = []
......@@ -120,9 +120,9 @@ class Tacotron():
gpus = ["/gpu:{}".format(i) for i in
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
for i in range(hp.tacotron_num_gpus):
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
with tf.variable_scope("inference") as scope:
with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
with tf.compat.v1.variable_scope("inference") as scope:
assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
assert global_step is not None
......@@ -132,7 +132,7 @@ class Tacotron():
post_condition = hp.predict_linear and not gta
# Embeddings ==> [batch_size, sequence_length, embedding_dim]
self.embedding_table = tf.get_variable(
self.embedding_table = tf.compat.v1.get_variable(
"inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
......@@ -283,7 +283,7 @@ class Tacotron():
self.tower_targets_lengths = tower_targets_lengths
self.tower_stop_token_targets = tower_stop_token_targets
self.all_vars = tf.trainable_variables()
self.all_vars = tf.compat.v1.trainable_variables()
log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
log(" Train mode: {}".format(is_training))
......@@ -331,9 +331,9 @@ class Tacotron():
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
for i in range(hp.tacotron_num_gpus):
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
with tf.variable_scope("loss") as scope:
with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
with tf.compat.v1.variable_scope("loss") as scope:
if hp.mask_decoder:
# Compute loss of predictions before postnet
before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i],
......@@ -356,11 +356,11 @@ class Tacotron():
linear_loss = 0.
# Compute loss of predictions before postnet
before = tf.losses.mean_squared_error(self.tower_mel_targets[i],
before = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
# Compute loss after postnet
after = tf.losses.mean_squared_error(self.tower_mel_targets[i],
after = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
# Compute <stop_token> loss (for learning dynamic generation stop)
stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
......@@ -439,7 +439,7 @@ class Tacotron():
grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0]
with tf.device(grad_device):
with tf.variable_scope("optimizer") as scope:
with tf.compat.v1.variable_scope("optimizer") as scope:
if hp.tacotron_decay_learning_rate:
self.decay_steps = hp.tacotron_decay_steps
self.decay_rate = hp.tacotron_decay_rate
......@@ -448,16 +448,16 @@ class Tacotron():
self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)
optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
# 2. Compute Gradient
for i in range(hp.tacotron_num_gpus):
# Device placement
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
# agg_loss += self.tower_loss[i]
with tf.variable_scope("optimizer") as scope:
with tf.compat.v1.variable_scope("optimizer") as scope:
gradients = optimizer.compute_gradients(self.tower_loss[i])
......@@ -490,7 +490,7 @@ class Tacotron():
# Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
# https://github.com/tensorflow/tensorflow/issues/1122
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
with tf.control_dependencies(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)):
self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars),
......@@ -510,12 +510,12 @@ class Tacotron():
hp = self._hparams
# Compute natural exponential decay
lr = tf.train.exponential_decay(init_lr,
global_step - hp.tacotron_start_decay,
# lr = 1e-3 at step 50k
self.decay_rate, # lr = 1e-5 around step 310k
lr = tf.compat.v1.train.exponential_decay(init_lr,
global_step - hp.tacotron_start_decay,
# lr = 1e-3 at step 50k
self.decay_rate, # lr = 1e-5 around step 310k
# clip learning rate by max and min values (initial and final values)
return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)
......@@ -12,13 +12,13 @@ class Tacotron2:
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
log("Constructing model: %s" % model_name)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths")
speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs")
input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths")
speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
with tf.variable_scope("Tacotron_model") as scope:
targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
with tf.compat.v1.variable_scope("Tacotron_model") as scope:
self.model = create_model(model_name, hparams)
if gta:
self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
......@@ -52,14 +52,14 @@ class Tacotron2:
log("Loading checkpoint: %s" % checkpoint_path)
#Memory allocation on the GPUs as needed
config = tf.ConfigProto()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
self.session = tf.Session(config=config)
self.session = tf.compat.v1.Session(config=config)
saver = tf.train.Saver()
saver = tf.compat.v1.train.Saver()
saver.restore(self.session, checkpoint_path)
def my_synthesize(self, speaker_embeds, texts):
......@@ -33,48 +33,48 @@ def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoi
def add_train_stats(model, hparams):
# Register training summaries under the "stats" variable scope and return the result
# of summary.merge_all().
#
# Summaries registered, all read from attributes of `model`:
#   - per-tower histograms of tower_mel_outputs / tower_mel_targets
#     (one per index in range(hparams.tacotron_num_gpus));
#   - scalars: before_loss, after_loss, regularization_loss, stop_token_loss, loss,
#     learning_rate;
#   - if hparams.predict_linear: scalar linear_loss plus per-tower histograms of
#     tower_linear_outputs / tower_linear_targets;
#   - if hparams.tacotron_teacher_forcing_mode == "scheduled": scalar
#     teacher_forcing_ratio (model.ratio);
#   - a histogram of the norm of each entry in model.gradients, and the maximum of
#     those norms as a scalar.
#
# NOTE(review): the linear histograms reuse the tags "mel_outputs %d" / "mel_targets %d"
# already used for the mel histograms; this looks like a copy-paste, confirm intended.
# NOTE(review): each statement appears twice (tf.* then tf.compat.v1.*); these look like
# the before/after lines of the TF-compat migration; confirm which version is live.
with tf.variable_scope("stats") as scope:
with tf.compat.v1.variable_scope("stats") as scope:
for i in range(hparams.tacotron_num_gpus):
tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i])
tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i])
tf.summary.scalar("before_loss", model.before_loss)
tf.summary.scalar("after_loss", model.after_loss)
tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i])
tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i])
tf.compat.v1.summary.scalar("before_loss", model.before_loss)
tf.compat.v1.summary.scalar("after_loss", model.after_loss)
if hparams.predict_linear:
tf.summary.scalar("linear_loss", model.linear_loss)
tf.compat.v1.summary.scalar("linear_loss", model.linear_loss)
for i in range(hparams.tacotron_num_gpus):
tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i])
tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i])
tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i])
tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i])
tf.summary.scalar("regularization_loss", model.regularization_loss)
tf.summary.scalar("stop_token_loss", model.stop_token_loss)
tf.summary.scalar("loss", model.loss)
tf.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed
tf.compat.v1.summary.scalar("regularization_loss", model.regularization_loss)
tf.compat.v1.summary.scalar("stop_token_loss", model.stop_token_loss)
tf.compat.v1.summary.scalar("loss", model.loss)
tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed
if hparams.tacotron_teacher_forcing_mode == "scheduled":
tf.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing
tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing
# ratio decay when mode = "scheduled"
gradient_norms = [tf.norm(grad) for grad in model.gradients]
tf.summary.histogram("gradient_norm", gradient_norms)
tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize
gradient_norms = [tf.norm(tensor=grad) for grad in model.gradients]
tf.compat.v1.summary.histogram("gradient_norm", gradient_norms)
tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(input_tensor=gradient_norms)) # visualize
# gradients (in case of explosion)
return tf.summary.merge_all()
return tf.compat.v1.summary.merge_all()
def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss,
values = [
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss),
tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss),
if linear_loss is not None:
test_summary = tf.Summary(value=values)
test_summary = tf.compat.v1.Summary(value=values)
summary_writer.add_summary(test_summary, step)
......@@ -83,7 +83,7 @@ def time_string():
def model_train_mode(args, feeder, hparams, global_step):
with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
model = create_model("Tacotron", hparams)
model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings,
feeder.mel_targets, feeder.token_targets,
......@@ -96,7 +96,7 @@ def model_train_mode(args, feeder, hparams, global_step):
def model_test_mode(args, feeder, hparams, global_step):
with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
model = create_model("Tacotron", hparams)
model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
feeder.eval_speaker_embeddings, feeder.eval_mel_targets,
......@@ -136,11 +136,11 @@ def train(log_dir, args, hparams):
# Start by setting a seed for repeatability
# Set up data feeder
coord = tf.train.Coordinator()
with tf.variable_scope("datafeeder") as scope:
with tf.compat.v1.variable_scope("datafeeder") as scope:
feeder = Feeder(coord, metadat_fpath, hparams)
# Set up model:
......@@ -164,21 +164,21 @@ def train(log_dir, args, hparams):
step = 0
time_window = ValueWindow(100)
loss_window = ValueWindow(100)
saver = tf.train.Saver(max_to_keep=5)
saver = tf.compat.v1.train.Saver(max_to_keep=5)
log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))
# Memory allocation on the GPU as needed
config = tf.ConfigProto()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
# Train
with tf.Session(config=config) as sess:
with tf.compat.v1.Session(config=config) as sess:
summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
summary_writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph)
# saved model restoring
if args.restore:
......@@ -6,7 +6,7 @@ import torch
_model = None # type: WaveRNN
def load_model(weights_fpath, verbose=True):
global _model
global _model, _device
if verbose:
print("Building Wave-RNN")
......@@ -23,11 +23,17 @@ def load_model(weights_fpath, verbose=True):
if torch.cuda.is_available():
_model = _model.cuda()
_device = torch.device('cuda')
_device = torch.device('cpu')
if verbose:
print("Loading model weights at %s" % weights_fpath)
checkpoint = torch.load(weights_fpath)
checkpoint = torch.load(weights_fpath, _device)
......@@ -157,7 +157,10 @@ class WaveRNN(nn.Module):
rnn2 = self.get_gru_cell(self.rnn2)
with torch.no_grad():
mels = mels.cuda()
if torch.cuda.is_available():
mels = mels.cuda()
mels = mels.cpu()
wave_len = (mels.size(-1) - 1) * self.hop_length
mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
mels, aux = self.upsample(mels.transpose(1, 2))
......@@ -168,9 +171,14 @@ class WaveRNN(nn.Module):
b_size, seq_len, _ = mels.size()
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
x = torch.zeros(b_size, 1).cuda()
if torch.cuda.is_available():
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
x = torch.zeros(b_size, 1).cuda()
h1 = torch.zeros(b_size, self.rnn_dims).cpu()
h2 = torch.zeros(b_size, self.rnn_dims).cpu()
x = torch.zeros(b_size, 1).cpu()
d = self.aux_dims
aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
......@@ -260,7 +268,10 @@ class WaveRNN(nn.Module):
# i.e., it won't generalise to other shapes/dims
b, t, c = x.size()
total = t + 2 * pad if side == 'both' else t + pad
padded = torch.zeros(b, total, c).cuda()
if torch.cuda.is_available():
padded = torch.zeros(b, total, c).cuda()
padded = torch.zeros(b, total, c).cpu()
if side == 'before' or side == 'both':
padded[:, pad:pad + t, :] = x
elif side == 'after':
......@@ -306,7 +317,10 @@ class WaveRNN(nn.Module):
padding = target + 2 * overlap - remaining
x = self.pad_tensor(x, padding, side='after')
folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
if torch.cuda.is_available():
folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()
# Get the values for the folded tensor
for i in range(num_folds):
