From 1b8d2e794b32039aa7ecc6367dabb64a3e5e6467 Mon Sep 17 00:00:00 2001
From: blue-fish <67130644+blue-fish@users.noreply.github.com>
Date: Mon, 22 Jun 2020 09:44:36 -0700
Subject: [PATCH] Cleanup pr 331 (#366)

[#366] Add CPU support. Also some updates for tensorflow v2 compatibility (in work)

Co-authored-by: pusalieth
---
 demo_cli.py                        | 41 +++++++++--------
 encoder/inference.py               |  2 +-
 encoder/train.py                   |  6 +--
 requirements.txt                   |  6 ++-
 synthesizer/feeder.py              | 18 ++++----
 synthesizer/inference.py           |  5 +--
 synthesizer/models/attention.py    |  8 ++--
 synthesizer/models/helpers.py      |  2 +-
 synthesizer/models/modules.py      | 70 +++++++++++++++--------------
 synthesizer/models/tacotron.py     | 60 ++++++++++++------------
 synthesizer/tacotron2.py           | 20 ++++-----
 synthesizer/train.py               | 72 +++++++++++++++---------------
 vocoder/inference.py               | 12 +++--
 vocoder/models/fatchord_version.py | 26 ++++++++---
 14 files changed, 188 insertions(+), 160 deletions(-)

diff --git a/demo_cli.py b/demo_cli.py
index 57bb001..1fb2df4 100644
--- a/demo_cli.py
+++ b/demo_cli.py
@@ -5,6 +5,7 @@ from encoder import inference as encoder
 from vocoder import inference as vocoder
 from pathlib import Path
 import numpy as np
+import soundfile as sf
 import librosa
 import argparse
 import torch
@@ -30,6 +31,7 @@ if __name__ == '__main__':
         "overhead but allows to save some GPU memory for lower-end GPUs.")
     parser.add_argument("--no_sound", action="store_true", help=\
         "If True, audio won't be played.")
+    parser.add_argument("--cpu", help="Use CPU.", action="store_true")
     args = parser.parse_args()
     print_args(args, parser)
     if not args.no_sound:
@@ -38,22 +40,25 @@ if __name__ == '__main__':
     ## Print some environment information (for debugging purposes)
     print("Running a test of your configuration...\n")
 
-    if not torch.cuda.is_available():
-        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
+    if args.cpu:
+        print("Using CPU for inference.")
+    elif torch.cuda.is_available():
+        device_id = torch.cuda.current_device()
+        gpu_properties = torch.cuda.get_device_properties(device_id)
+        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+              "%.1fGb total memory.\n" %
+              (torch.cuda.device_count(),
+               device_id,
+               gpu_properties.name,
+               gpu_properties.major,
+               gpu_properties.minor,
+               gpu_properties.total_memory / 1e9))
+    else:
+        print("Your PyTorch installation is not configured. If you have a GPU ready "
               "for deep learning, ensure that the drivers are properly installed, and that your "
-              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
-              "not supported.", file=sys.stderr)
+              "CUDA version matches your PyTorch installation.", file=sys.stderr)
+        print("\nIf you're trying to use a cpu, please use the option --cpu.", file=sys.stderr)
         quit(-1)
-    device_id = torch.cuda.current_device()
-    gpu_properties = torch.cuda.get_device_properties(device_id)
-    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
-          "%.1fGb total memory.\n" %
-          (torch.cuda.device_count(),
-           device_id,
-           gpu_properties.name,
-           gpu_properties.major,
-           gpu_properties.minor,
-           gpu_properties.total_memory / 1e9))
 
 
     ## Load the models one by one.
@@ -172,15 +177,13 @@ if __name__ == '__main__':
                 sd.play(generated_wav, synthesizer.sample_rate)
             
             # Save it on the disk
-            fpath = "demo_output_%02d.wav" % num_generated
+            filename = "demo_output_%02d.wav" % num_generated
             print(generated_wav.dtype)
-            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
-                                     synthesizer.sample_rate)
+            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
             num_generated += 1
-            print("\nSaved output as %s\n\n" % fpath)
+            print("\nSaved output as %s\n\n" % filename)
             
         except Exception as e:
             print("Caught exception: %s" % repr(e))
             print("Restarting\n")
-    
\ No newline at end of file
diff --git a/encoder/inference.py b/encoder/inference.py
index 2447832..d769dd1 100644
--- a/encoder/inference.py
+++ b/encoder/inference.py
@@ -30,7 +30,7 @@ def load_model(weights_fpath: Path, device=None):
     elif isinstance(device, str):
         _device = torch.device(device)
     _model = SpeakerEncoder(_device, torch.device("cpu"))
-    checkpoint = torch.load(weights_fpath)
+    checkpoint = torch.load(weights_fpath, _device)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
diff --git a/encoder/train.py b/encoder/train.py
index 071af1b..ee65418 100644
--- a/encoder/train.py
+++ b/encoder/train.py
@@ -7,11 +7,12 @@ from pathlib import Path
 import torch
 
 def sync(device: torch.device):
-    # FIXME
-    return 
     # For correct profiling (cuda operations are async)
     if device.type == "cuda":
         torch.cuda.synchronize(device)
+    else:
+        torch.cpu.synchronize(device)
+
 
 def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
           backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
@@ -122,4 +123,3 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
             }, backup_fpath)
             
         profiler.tick("Extras (visualizations, saving)")
-        
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 4b54673..c0fdf41 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,8 @@
-tensorflow-gpu>=1.10.0,<=1.14.0
+# each portion of tensorflow is needed
+# core package is for RNN, cpu and gpu are for specific system speed-ups
+tensorflow==1.15
+tensorflow-cpu==1.15
+tensorflow-gpu==1.15
 umap-learn
 visdom
 webrtcvad
diff --git a/synthesizer/feeder.py b/synthesizer/feeder.py
index 6fc1b20..b1acb3d 100644
--- a/synthesizer/feeder.py
+++ b/synthesizer/feeder.py
@@ -70,22 +70,22 @@ class Feeder:
         # Create placeholders for inputs and targets. Don"t specify batch size because we want
         # to be able to feed different batch sizes at eval time.
         self._placeholders = [
-            tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
-            tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
-            tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
+            tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"),
+            tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
+            tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
                            name="mel_targets"),
-            tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
-            tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
-            tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
+            tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"),
+            tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
+            tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
                            name="split_infos"),
             
             # SV2TTS
-            tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
+            tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
                            name="speaker_embeddings")
         ]
         
         # Create queue for buffering data
-        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
+        queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
                                  tf.int32, tf.int32, tf.float32], name="input_queue")
         self._enqueue_op = queue.enqueue(self._placeholders)
         self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
@@ -100,7 +100,7 @@ class Feeder:
         self.speaker_embeddings.set_shape(self._placeholders[6].shape)
         
         # Create eval queue for buffering eval data
-        eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
+        eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
                                       tf.int32, tf.int32, tf.float32], name="eval_queue")
         self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
         self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
diff --git a/synthesizer/inference.py b/synthesizer/inference.py
index 99fb778..86bd2fb 100644
--- a/synthesizer/inference.py
+++ b/synthesizer/inference.py
@@ -54,7 +54,7 @@ class Synthesizer:
         """
         if self._low_mem:
             raise Exception("Cannot load the synthesizer permanently in low mem mode")
-        tf.reset_default_graph()
+        tf.compat.v1.reset_default_graph()
         self._model = Tacotron2(self.checkpoint_fpath, hparams)
     
     def synthesize_spectrograms(self, texts: List[str],
@@ -88,7 +88,7 @@ class Synthesizer:
     @staticmethod
     def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
         # Load the model and forward the inputs
-        tf.reset_default_graph()
+        tf.compat.v1.reset_default_graph()
         model = Tacotron2(checkpoint_fpath, hparams)
         specs, alignments = model.my_synthesize(embeddings, texts)
         
@@ -134,4 +134,3 @@ class Synthesizer:
         with the same parameters present in hparams.py.
""" return audio.inv_mel_spectrogram(mel, hparams) - \ No newline at end of file diff --git a/synthesizer/models/attention.py b/synthesizer/models/attention.py index 58892ad..1f40d45 100644 --- a/synthesizer/models/attention.py +++ b/synthesizer/models/attention.py @@ -60,10 +60,10 @@ def _location_sensitive_score(W_query, W_fil, W_keys): dtype = W_query.dtype num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] - v_a = tf.get_variable( + v_a = tf.compat.v1.get_variable( "attention_variable_projection", shape=[num_units], dtype=dtype, initializer=tf.contrib.layers.xavier_initializer()) - b_a = tf.get_variable( + b_a = tf.compat.v1.get_variable( "attention_bias", shape=[num_units], dtype=dtype, initializer=tf.zeros_initializer()) @@ -155,10 +155,10 @@ class LocationSensitiveAttention(BahdanauAttention): probability_fn=normalization_function, name=name) - self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, + self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters, kernel_size=hparams.attention_kernel, padding="same", use_bias=True, bias_initializer=tf.zeros_initializer(), name="location_features_convolution") - self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, + self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False, dtype=tf.float32, name="location_features_layer") self._cumulate = cumulate_weights diff --git a/synthesizer/models/helpers.py b/synthesizer/models/helpers.py index eec0699..4e58ccd 100644 --- a/synthesizer/models/helpers.py +++ b/synthesizer/models/helpers.py @@ -119,7 +119,7 @@ class TacoTrainingHelper(Helper): #Pick previous outputs randomly with respect to teacher forcing ratio next_inputs = tf.cond( - tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), + tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), lambda: self._targets[:, time, :], #Teacher-forcing: return true frame lambda: outputs[:,-self._output_dim:]) diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 7696572..f9fe7eb 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -1,4 +1,5 @@ import tensorflow as tf +import torch class HighwayNet: @@ -6,12 +7,12 @@ class HighwayNet: self.units = units self.scope = "HighwayNet" if name is None else name - self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") - self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", + self.H_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") + self.T_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", bias_initializer=tf.constant_initializer(-1.)) def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): H = self.H_layer(inputs) T = self.T_layer(inputs) return H * T + inputs * (1. 
@@ -38,8 +39,8 @@ class CBHG:
             self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope))
 
     def __call__(self, inputs, input_lengths):
-        with tf.variable_scope(self.scope):
-            with tf.variable_scope("conv_bank"):
+        with tf.compat.v1.variable_scope(self.scope):
+            with tf.compat.v1.variable_scope("conv_bank"):
                 # Convolution bank: concatenate on the last axis to stack channels from all
                 # convolutions
                 # The convolution bank uses multiple different kernel sizes to have many insights
@@ -71,7 +72,7 @@ class CBHG:
             # Additional projection in case of dimension mismatch (for HighwayNet "residual"
             # connection)
             if highway_input.shape[2] != self.highway_units:
-                highway_input = tf.layers.dense(highway_input, self.highway_units)
+                highway_input = tf.compat.v1.layers.Dense(highway_input, self.highway_units)
 
             # 4-layer HighwayNet
             for highwaynet in self.highwaynet_layers:
@@ -88,7 +89,7 @@ class CBHG:
             return tf.concat(outputs, axis=2)  # Concat forward and backward outputs
 
 
-class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
+class ZoneoutLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell):
     """Wrapper for tf LSTM to create Zoneout LSTM Cell
 
     inspired by:
@@ -108,8 +109,11 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
 
         if zm < 0. or zs > 1.:
             raise ValueError("One/both provided Zoneout factors are not in [0, 1]")
-
-        self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
+
+        if torch.cuda.is_available():
+            self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(num_units, name=name)
+        else:
+            self._cell = tf.contrib.rnn.LSTMBlockCell(num_units, name=name)
         self._zoneout_cell = zoneout_factor_cell
         self._zoneout_outputs = zoneout_factor_output
         self.is_training = is_training
@@ -144,16 +148,13 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
         if self.is_training:
             # nn.dropout takes keep_prob (probability to keep activations) not drop_prob (
             # probability to mask activations)!
-            c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
-                                                         (1 - self._zoneout_cell)) + prev_c
-            h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
-                                                            (1 - self._zoneout_outputs)) + prev_h
-
+            c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c
+            h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h
         else:
             c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
             h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
 
-        new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
+        new_state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
                                                                                                   h])
 
         return output, new_state
@@ -175,7 +176,7 @@ class EncoderConvolutions:
         """
         super(EncoderConvolutions, self).__init__()
         self.is_training = is_training
-        
+
         self.kernel_size = hparams.enc_conv_kernel_size
         self.channels = hparams.enc_conv_channels
         self.activation = activation
@@ -184,7 +185,7 @@ class EncoderConvolutions:
         self.enc_conv_num_layers = hparams.enc_conv_num_layers
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             x = inputs
             for i in range(self.enc_conv_num_layers):
                 x = conv1d(x, self.kernel_size, self.channels, self.activation,
@@ -226,8 +227,8 @@ class EncoderRNN:
                                        name="encoder_bw_LSTM")
 
     def __call__(self, inputs, input_lengths):
-        with tf.variable_scope(self.scope):
-            outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
+        with tf.compat.v1.variable_scope(self.scope):
+            outputs, (fw_state, bw_state) = tf.compat.v1.nn.bidirectional_dynamic_rnn(
                 self._fw_cell,
                 self._bw_cell,
                 inputs,
@@ -239,7 +240,8 @@ class EncoderRNN:
 
 
 class Prenet:
-    """Two fully connected layers used as an information bottleneck for the attention.
+    """
+    Two fully connected layers used as an information bottleneck for the attention.
     """
 
     def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu,
@@ -263,13 +265,13 @@ class Prenet:
     def __call__(self, inputs):
         x = inputs
 
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             for i, size in enumerate(self.layers_sizes):
-                dense = tf.layers.dense(x, units=size, activation=self.activation,
+                dense = tf.compat.v1.layers.dense(x, units=size, activation=self.activation,
                                         name="dense_{}".format(i + 1))
 
                 # The paper discussed introducing diversity in generation at inference time
                 # by using a dropout of 0.5 only in prenet layers (in both training and inference).
-                x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
+                x = tf.compat.v1.layers.dropout(dense, rate=self.drop_rate, training=True,
                                       name="dropout_{}".format(i + 1) + self.scope)
         return x
 
@@ -302,10 +304,10 @@ class DecoderRNN:
                                            name="decoder_LSTM_{}".format(i + 1)) for i in
                            range(layers)]
 
-        self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
+        self._cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
 
     def __call__(self, inputs, states):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             return self._cell(inputs, states)
 
 
@@ -327,14 +329,14 @@ class FrameProjection:
         self.activation = activation
 
         self.scope = "Linear_projection" if scope is None else scope
-        self.dense = tf.layers.Dense(units=shape, activation=activation,
+        self.dense = tf.compat.v1.layers.Dense(units=shape, activation=activation,
                                      name="projection_{}".format(self.scope))
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             # If activation==None, this returns a simple Linear projection
             # else the projection will be passed through an activation function
-            # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
+            # output = tf.compat.v1.layers.Dense(inputs, units=self.shape, activation=self.activation,
             # name="projection_{}".format(self.scope))
             output = self.dense(inputs)
 
@@ -362,7 +364,7 @@ class StopProjection:
         self.scope = "stop_token_projection" if scope is None else scope
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             output = tf.layers.dense(inputs, units=self.shape,
                                      activation=None, name="projection_{}".format(self.scope))
 
@@ -399,7 +401,7 @@ class Postnet:
         self.drop_rate = hparams.tacotron_dropout_rate
 
     def __call__(self, inputs):
-        with tf.variable_scope(self.scope):
+        with tf.compat.v1.variable_scope(self.scope):
             x = inputs
             for i in range(self.postnet_num_layers - 1):
                 x = conv1d(x, self.kernel_size, self.channels, self.activation,
@@ -412,16 +414,16 @@ class Postnet:
 
 
 def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
-    with tf.variable_scope(scope):
-        conv1d_output = tf.layers.conv1d(
+    with tf.compat.v1.variable_scope(scope):
+        conv1d_output = tf.compat.v1.layers.conv1d(
             inputs,
             filters=channels,
             kernel_size=kernel_size,
             activation=None,
             padding="same")
-        batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
+        batched = tf.compat.v1.layers.batch_normalization(conv1d_output, training=is_training)
         activated = activation(batched)
-        return tf.layers.dropout(activated, rate=drop_rate, training=is_training,
+        return tf.compat.v1.layers.dropout(activated, rate=drop_rate, training=is_training,
                                  name="dropout_{}".format(scope))
diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py
index 9c4de4d..8cf2d3d 100644
--- a/synthesizer/models/tacotron.py
+++ b/synthesizer/models/tacotron.py
@@ -83,11 +83,11 @@ class Tacotron():
             ##############
-            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
-            p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
-                lout_float) if mel_targets is not None else mel_targets
-            p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
-                lout_float) if stop_token_targets is not None else \
+            p_inputs = tf.numpy_function(split_func, [inputs, split_infos[:, 0]], lout_int)
+            p_mel_targets = tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]],
+                lout_float) if mel_targets is not None else mel_targets
+            p_stop_token_targets = tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]],
+                lout_float) if stop_token_targets is not None else \
                 stop_token_targets
 
             tower_inputs = []
@@ -120,9 +120,9 @@ class Tacotron():
         gpus = ["/gpu:{}".format(i) for i in
                 range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
         for i in range(hp.tacotron_num_gpus):
-            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
-                                                          worker_device=gpus[i])):
-                with tf.variable_scope("inference") as scope:
+            with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
+                                                                    worker_device=gpus[i])):
+                with tf.compat.v1.variable_scope("inference") as scope:
                     assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
                     if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
                         assert global_step is not None
@@ -132,7 +132,7 @@ class Tacotron():
                     post_condition = hp.predict_linear and not gta
 
                     # Embeddings ==> [batch_size, sequence_length, embedding_dim]
-                    self.embedding_table = tf.get_variable(
+                    self.embedding_table = tf.compat.v1.get_variable(
                         "inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
                     embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
 
@@ -283,7 +283,7 @@ class Tacotron():
         self.tower_targets_lengths = tower_targets_lengths
         self.tower_stop_token_targets = tower_stop_token_targets
 
-        self.all_vars = tf.trainable_variables()
+        self.all_vars = tf.compat.v1.trainable_variables()
 
         log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
         log("  Train mode: {}".format(is_training))
@@ -331,9 +331,9 @@ class Tacotron():
                 range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
         for i in range(hp.tacotron_num_gpus):
-            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
-                                                          worker_device=gpus[i])):
-                with tf.variable_scope("loss") as scope:
+            with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
+                                                                    worker_device=gpus[i])):
+                with tf.compat.v1.variable_scope("loss") as scope:
                     if hp.mask_decoder:
                         # Compute loss of predictions before postnet
                         before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i],
@@ -356,11 +356,11 @@ class Tacotron():
                         linear_loss = 0.
                     else:
                         # Compute loss of predictions before postnet
-                        before = tf.losses.mean_squared_error(self.tower_mel_targets[i],
-                                                              self.tower_decoder_output[i])
+                        before = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
+                                                                        self.tower_decoder_output[i])
                         # Compute loss after postnet
-                        after = tf.losses.mean_squared_error(self.tower_mel_targets[i],
-                                                             self.tower_mel_outputs[i])
+                        after = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
+                                                                       self.tower_mel_outputs[i])
                         # Compute loss (for learning dynamic generation stop)
                         stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                             labels=self.tower_stop_token_targets[i],
@@ -439,7 +439,7 @@ class Tacotron():
         grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0]
         with tf.device(grad_device):
-            with tf.variable_scope("optimizer") as scope:
+            with tf.compat.v1.variable_scope("optimizer") as scope:
                 if hp.tacotron_decay_learning_rate:
                     self.decay_steps = hp.tacotron_decay_steps
                     self.decay_rate = hp.tacotron_decay_rate
@@ -448,16 +448,16 @@ class Tacotron():
                 else:
                     self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)
 
-                optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
-                                                   hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
+                optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
+                                                             hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
 
             # 2. Compute Gradient
             for i in range(hp.tacotron_num_gpus):
                 # Device placement
-                with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
-                                                              worker_device=gpus[i])):
+                with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
+                                                                        worker_device=gpus[i])):
                     # agg_loss += self.tower_loss[i]
-                    with tf.variable_scope("optimizer") as scope:
+                    with tf.compat.v1.variable_scope("optimizer") as scope:
                         gradients = optimizer.compute_gradients(self.tower_loss[i])
                         tower_gradients.append(gradients)
@@ -490,7 +490,7 @@ class Tacotron():
             # Add dependency on UPDATE_OPS; otherwise batchnorm won"t work correctly. See:
             # https://github.com/tensorflow/tensorflow/issues/1122
-            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
+            with tf.control_dependencies(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)):
                 self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars),
                                                           global_step=global_step)
 
@@ -510,12 +510,12 @@ class Tacotron():
         hp = self._hparams
 
         # Compute natural exponential decay
-        lr = tf.train.exponential_decay(init_lr,
-                                        global_step - hp.tacotron_start_decay,
-                                        # lr = 1e-3 at step 50k
-                                        self.decay_steps,
-                                        self.decay_rate,  # lr = 1e-5 around step 310k
-                                        name="lr_exponential_decay")
+        lr = tf.compat.v1.train.exponential_decay(init_lr,
+                                                  global_step - hp.tacotron_start_decay,
+                                                  # lr = 1e-3 at step 50k
+                                                  self.decay_steps,
+                                                  self.decay_rate,  # lr = 1e-5 around step 310k
+                                                  name="lr_exponential_decay")
 
         # clip learning rate by max and min values (initial and final values)
         return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)
diff --git a/synthesizer/tacotron2.py b/synthesizer/tacotron2.py
index 4a5b199..e4c6850 100644
--- a/synthesizer/tacotron2.py
+++ b/synthesizer/tacotron2.py
@@ -12,13 +12,13 @@ class Tacotron2:
     def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
         log("Constructing model: %s" % model_name)
         #Force the batch size to be known in order to use attention masking in batch synthesis
-        inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
-        input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths")
-        speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
+        inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs")
+        input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths")
+        speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
                                             name="speaker_embeddings")
-        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
-        split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
-        with tf.variable_scope("Tacotron_model") as scope:
+        targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
+        split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
+        with tf.compat.v1.variable_scope("Tacotron_model") as scope:
             self.model = create_model(model_name, hparams)
             if gta:
                 self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
@@ -52,14 +52,14 @@ class Tacotron2:
         log("Loading checkpoint: %s" % checkpoint_path)
         
         #Memory allocation on the GPUs as needed
-        config = tf.ConfigProto()
+        config = tf.compat.v1.ConfigProto()
         config.gpu_options.allow_growth = True
         config.allow_soft_placement = True
 
-        self.session = tf.Session(config=config)
-        self.session.run(tf.global_variables_initializer())
+        self.session = tf.compat.v1.Session(config=config)
+        self.session.run(tf.compat.v1.global_variables_initializer())
 
-        saver = tf.train.Saver()
+        saver = tf.compat.v1.train.Saver()
         saver.restore(self.session, checkpoint_path)
 
     def my_synthesize(self, speaker_embeds, texts):
diff --git a/synthesizer/train.py b/synthesizer/train.py
index 4fe6bbd..416c4a9 100644
--- a/synthesizer/train.py
+++ b/synthesizer/train.py
@@ -33,48 +33,48 @@ def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoi
 
 
 def add_train_stats(model, hparams):
-    with tf.variable_scope("stats") as scope:
+    with tf.compat.v1.variable_scope("stats") as scope:
tf.compat.v1.variable_scope("stats") as scope: for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) - tf.summary.scalar("before_loss", model.before_loss) - tf.summary.scalar("after_loss", model.after_loss) + tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) + tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) + tf.compat.v1.summary.scalar("before_loss", model.before_loss) + tf.compat.v1.summary.scalar("after_loss", model.after_loss) if hparams.predict_linear: - tf.summary.scalar("linear_loss", model.linear_loss) + tf.compat.v1.summary.scalar("linear_loss", model.linear_loss) for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) + tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) + tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) - tf.summary.scalar("regularization_loss", model.regularization_loss) - tf.summary.scalar("stop_token_loss", model.stop_token_loss) - tf.summary.scalar("loss", model.loss) - tf.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed + tf.compat.v1.summary.scalar("regularization_loss", model.regularization_loss) + tf.compat.v1.summary.scalar("stop_token_loss", model.stop_token_loss) + tf.compat.v1.summary.scalar("loss", model.loss) + tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed if hparams.tacotron_teacher_forcing_mode == "scheduled": - tf.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing + tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing # ratio decay when mode = "scheduled" - gradient_norms = [tf.norm(grad) for grad in model.gradients] - tf.summary.histogram("gradient_norm", gradient_norms) - tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize + gradient_norms = [tf.norm(tensor=grad) for grad in model.gradients] + tf.compat.v1.summary.histogram("gradient_norm", gradient_norms) + tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(input_tensor=gradient_norms)) # visualize # gradients (in case of explosion) - return tf.summary.merge_all() + return tf.compat.v1.summary.merge_all() def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): values = [ - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", - simple_value=before_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", - simple_value=after_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", - simple_value=stop_token_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", + simple_value=before_loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", + simple_value=after_loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", + simple_value=stop_token_loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), ] if linear_loss is not None: - 
-        values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
-                                       simple_value=linear_loss))
-    test_summary = tf.Summary(value=values)
+        values.append(tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
+                                                 simple_value=linear_loss))
+    test_summary = tf.compat.v1.Summary(value=values)
     summary_writer.add_summary(test_summary, step)
 
 
@@ -83,7 +83,7 @@ def time_string():
 
 
 def model_train_mode(args, feeder, hparams, global_step):
-    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
+    with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
         model = create_model("Tacotron", hparams)
         model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings,
                          feeder.mel_targets, feeder.token_targets,
@@ -96,7 +96,7 @@ def model_train_mode(args, feeder, hparams, global_step):
 
 
 def model_test_mode(args, feeder, hparams, global_step):
-    with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
+    with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
         model = create_model("Tacotron", hparams)
         model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
                          feeder.eval_speaker_embeddings, feeder.eval_mel_targets,
@@ -136,11 +136,11 @@ def train(log_dir, args, hparams):
     log(hparams_debug_string())
 
     # Start by setting a seed for repeatability
-    tf.set_random_seed(hparams.tacotron_random_seed)
+    tf.compat.v1.set_random_seed(hparams.tacotron_random_seed)
 
     # Set up data feeder
     coord = tf.train.Coordinator()
-    with tf.variable_scope("datafeeder") as scope:
+    with tf.compat.v1.variable_scope("datafeeder") as scope:
         feeder = Feeder(coord, metadat_fpath, hparams)
 
     # Set up model:
@@ -164,21 +164,21 @@ def train(log_dir, args, hparams):
     step = 0
     time_window = ValueWindow(100)
     loss_window = ValueWindow(100)
-    saver = tf.train.Saver(max_to_keep=5)
+    saver = tf.compat.v1.train.Saver(max_to_keep=5)
 
     log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))
 
     # Memory allocation on the GPU as needed
-    config = tf.ConfigProto()
+    config = tf.compat.v1.ConfigProto()
     config.gpu_options.allow_growth = True
     config.allow_soft_placement = True
 
     # Train
-    with tf.Session(config=config) as sess:
+    with tf.compat.v1.Session(config=config) as sess:
         try:
-            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
+            summary_writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph)
 
-            sess.run(tf.global_variables_initializer())
+            sess.run(tf.compat.v1.global_variables_initializer())
 
             # saved model restoring
             if args.restore:
diff --git a/vocoder/inference.py b/vocoder/inference.py
index 19e639c..7e54684 100644
--- a/vocoder/inference.py
+++ b/vocoder/inference.py
@@ -6,7 +6,7 @@ import torch
 _model = None   # type: WaveRNN
 
 def load_model(weights_fpath, verbose=True):
-    global _model
+    global _model, _device
     
     if verbose:
         print("Building Wave-RNN")
@@ -23,11 +23,17 @@ def load_model(weights_fpath, verbose=True):
         hop_length=hp.hop_length,
         sample_rate=hp.sample_rate,
         mode=hp.voc_mode
-    ).cuda()
+    )
+
+    if torch.cuda.is_available():
+        _model = _model.cuda()
+        _device = torch.device('cuda')
+    else:
+        _device = torch.device('cpu')
     
     if verbose:
         print("Loading model weights at %s" % weights_fpath)
-    checkpoint = torch.load(weights_fpath)
+    checkpoint = torch.load(weights_fpath, _device)
     _model.load_state_dict(checkpoint['model_state'])
     _model.eval()
 
diff --git a/vocoder/models/fatchord_version.py b/vocoder/models/fatchord_version.py
index 798842d..429572b 100644
--- a/vocoder/models/fatchord_version.py
+++ b/vocoder/models/fatchord_version.py
@@ -157,7 +157,10 @@ class WaveRNN(nn.Module):
         rnn2 = self.get_gru_cell(self.rnn2)
 
         with torch.no_grad():
-            mels = mels.cuda()
+            if torch.cuda.is_available():
+                mels = mels.cuda()
+            else:
+                mels = mels.cpu()
             wave_len = (mels.size(-1) - 1) * self.hop_length
             mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
             mels, aux = self.upsample(mels.transpose(1, 2))
@@ -168,9 +171,14 @@ class WaveRNN(nn.Module):
             b_size, seq_len, _ = mels.size()
 
-            h1 = torch.zeros(b_size, self.rnn_dims).cuda()
-            h2 = torch.zeros(b_size, self.rnn_dims).cuda()
-            x = torch.zeros(b_size, 1).cuda()
+            if torch.cuda.is_available():
+                h1 = torch.zeros(b_size, self.rnn_dims).cuda()
+                h2 = torch.zeros(b_size, self.rnn_dims).cuda()
+                x = torch.zeros(b_size, 1).cuda()
+            else:
+                h1 = torch.zeros(b_size, self.rnn_dims).cpu()
+                h2 = torch.zeros(b_size, self.rnn_dims).cpu()
+                x = torch.zeros(b_size, 1).cpu()
 
             d = self.aux_dims
             aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
@@ -260,7 +268,10 @@ class WaveRNN(nn.Module):
         # i.e., it won't generalise to other shapes/dims
         b, t, c = x.size()
         total = t + 2 * pad if side == 'both' else t + pad
-        padded = torch.zeros(b, total, c).cuda()
+        if torch.cuda.is_available():
+            padded = torch.zeros(b, total, c).cuda()
+        else:
+            padded = torch.zeros(b, total, c).cpu()
         if side == 'before' or side == 'both':
             padded[:, pad:pad + t, :] = x
         elif side == 'after':
@@ -306,7 +317,10 @@ class WaveRNN(nn.Module):
             padding = target + 2 * overlap - remaining
             x = self.pad_tensor(x, padding, side='after')
 
-        folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
+        if torch.cuda.is_available():
+            folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
+        else:
+            folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()
 
         # Get the values for the folded tensor
         for i in range(num_folds):
-- 
GitLab
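
Editor's sketch (not part of the patch): the PyTorch-side changes above all follow one device-selection idiom, so a condensed example is given below under stated assumptions. The helper name load_on_best_device and the generic nn.Module argument are illustrative inventions; the "model_state" checkpoint key matches what the diff itself loads in encoder/inference.py and vocoder/inference.py.

    import torch
    from torch import nn

    def load_on_best_device(model: nn.Module, weights_fpath: str) -> nn.Module:
        # Prefer CUDA when present, otherwise fall back to CPU -- the same
        # torch.cuda.is_available() check the patch adds in vocoder/inference.py
        # and vocoder/models/fatchord_version.py.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        # Passing the device to torch.load (map_location) lets GPU-trained
        # checkpoints load on CPU-only machines, which is what the added
        # torch.load(weights_fpath, _device) calls achieve.
        checkpoint = torch.load(weights_fpath, map_location=device)
        model.load_state_dict(checkpoint["model_state"])
        model.eval()
        return model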