Unverified commit 1b8d2e79. Author: blue-fish. Committer: GitHub.

Cleanup pr 331 (#366)

[#366] Add CPU support, plus some updates for tensorflow v2 compatibility (work in progress)
Co-authored-by: pusalieth <pusalieth@users.noreply.github.com>
Parent 5d6d9ff4
......@@ -5,6 +5,7 @@ from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
......@@ -30,6 +31,7 @@ if __name__ == '__main__':
"overhead but allows to save some GPU memory for lower-end GPUs.")
parser.add_argument("--no_sound", action="store_true", help=\
"If True, audio won't be played.")
parser.add_argument("--cpu", help="Use CPU.", action="store_true")
args = parser.parse_args()
print_args(args, parser)
if not args.no_sound:
......@@ -38,22 +40,25 @@ if __name__ == '__main__':
## Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")
if not torch.cuda.is_available():
print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
if args.cpu:
print("Using CPU for inference.")
elif torch.cuda.is_available():
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
"%.1fGb total memory.\n" %
(torch.cuda.device_count(),
device_id,
gpu_properties.name,
gpu_properties.major,
gpu_properties.minor,
gpu_properties.total_memory / 1e9))
else:
print("Your PyTorch installation is not configured. If you have a GPU ready "
"for deep learning, ensure that the drivers are properly installed, and that your "
"CUDA version matches your PyTorch installation. CPU-only inference is currently "
"not supported.", file=sys.stderr)
"CUDA version matches your PyTorch installation.", file=sys.stderr)
print("\nIf you're trying to use a cpu, please use the option --cpu.", file=sys.stderr)
quit(-1)
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
"%.1fGb total memory.\n" %
(torch.cuda.device_count(),
device_id,
gpu_properties.name,
gpu_properties.major,
gpu_properties.minor,
gpu_properties.total_memory / 1e9))
## Load the models one by one.
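The new control flow above boils down to a three-way device check. A minimal standalone sketch of the same logic (a hypothetical helper; demo_cli.py keeps it inline in __main__):

import sys
import torch

def select_device(use_cpu: bool) -> torch.device:
    # Mirrors the --cpu / CUDA / error branches introduced by this hunk.
    if use_cpu:
        return torch.device("cpu")
    if torch.cuda.is_available():
        return torch.device("cuda")
    print("No usable CUDA device; re-run with --cpu for CPU inference.", file=sys.stderr)
    sys.exit(-1)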
......@@ -172,15 +177,13 @@ if __name__ == '__main__':
sd.play(generated_wav, synthesizer.sample_rate)
# Save it on the disk
fpath = "demo_output_%02d.wav" % num_generated
filename = "demo_output_%02d.wav" % num_generated
print(generated_wav.dtype)
librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
synthesizer.sample_rate)
sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
num_generated += 1
print("\nSaved output as %s\n\n" % fpath)
print("\nSaved output as %s\n\n" % filename)
except Exception as e:
print("Caught exception: %s" % repr(e))
print("Restarting\n")
\ No newline at end of file
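The write call above switches from librosa.output.write_wav, which was deprecated and later removed in librosa 0.8, to soundfile.write. A self-contained sketch of the replacement call (placeholder signal and sample rate, since the real values come from the synthesizer):

import numpy as np
import soundfile as sf

sample_rate = 16000                            # stand-in for synthesizer.sample_rate
wav = np.zeros(sample_rate, dtype=np.float32)  # one second of silence as a placeholder
sf.write("demo_output_00.wav", wav, sample_rate)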
......@@ -30,7 +30,7 @@ def load_model(weights_fpath: Path, device=None):
elif isinstance(device, str):
_device = torch.device(device)
_model = SpeakerEncoder(_device, torch.device("cpu"))
checkpoint = torch.load(weights_fpath)
checkpoint = torch.load(weights_fpath, _device)
_model.load_state_dict(checkpoint["model_state"])
_model.eval()
print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
......
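Passing _device as the second positional argument of torch.load sets map_location, which remaps GPU-saved tensors onto the CPU so the encoder checkpoint loads on machines without CUDA. Equivalent keyword form (assuming an encoder.pt checkpoint on disk):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load("encoder.pt", map_location=device)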
......@@ -7,11 +7,12 @@ from pathlib import Path
import torch
def sync(device: torch.device):
# FIXME
return
# For correct profiling (cuda operations are async)
if device.type == "cuda":
torch.cuda.synchronize(device)
else:
torch.cpu.synchronize(device)
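Note that CPU ops in PyTorch execute synchronously, so only the CUDA branch actually needs a barrier for accurate profiling; torch.cpu.synchronize is not available in the PyTorch releases this repo targets. A minimal sketch under that assumption:

import torch

def sync(device: torch.device):
    # CUDA kernels launch asynchronously; block until they finish so
    # profiler timestamps are accurate. CPU ops need no barrier.
    if device.type == "cuda":
        torch.cuda.synchronize(device)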
def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
......@@ -122,4 +123,3 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
}, backup_fpath)
profiler.tick("Extras (visualizations, saving)")
\ No newline at end of file
tensorflow-gpu>=1.10.0,<=1.14.0
# Each portion of tensorflow is needed:
# the core package provides the RNN ops; the cpu and gpu builds add system-specific speed-ups
tensorflow==1.15
tensorflow-cpu==1.15
tensorflow-gpu==1.15
umap-learn
visdom
webrtcvad
......
......@@ -70,22 +70,22 @@ class Feeder:
# Create placeholders for inputs and targets. Don"t specify batch size because we want
# to be able to feed different batch sizes at eval time.
self._placeholders = [
tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"),
tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
name="mel_targets"),
tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"),
tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
name="split_infos"),
# SV2TTS
tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
name="speaker_embeddings")
]
# Create queue for buffering data
queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="input_queue")
self._enqueue_op = queue.enqueue(self._placeholders)
self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
......@@ -100,7 +100,7 @@ class Feeder:
self.speaker_embeddings.set_shape(self._placeholders[6].shape)
# Create eval queue for buffering eval data
eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="eval_queue")
self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
......
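All of these tf.compat.v1 placeholder and queue calls are graph-mode constructs; on TensorFlow 2.x they only work after eager execution is disabled (not a concern on the pinned 1.15, which is graph-mode by default). A minimal sketch:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # required on TF 2.x; no-op concern on TF 1.15
inputs = tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs")
queue = tf.queue.FIFOQueue(8, [tf.int32], name="input_queue")
enqueue_op = queue.enqueue([inputs])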
......@@ -54,7 +54,7 @@ class Synthesizer:
"""
if self._low_mem:
raise Exception("Cannot load the synthesizer permanently in low mem mode")
tf.reset_default_graph()
tf.compat.v1.reset_default_graph()
self._model = Tacotron2(self.checkpoint_fpath, hparams)
def synthesize_spectrograms(self, texts: List[str],
......@@ -88,7 +88,7 @@ class Synthesizer:
@staticmethod
def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
# Load the model and forward the inputs
tf.reset_default_graph()
tf.compat.v1.reset_default_graph()
model = Tacotron2(checkpoint_fpath, hparams)
specs, alignments = model.my_synthesize(embeddings, texts)
......@@ -134,4 +134,3 @@ class Synthesizer:
with the same parameters present in hparams.py.
"""
return audio.inv_mel_spectrogram(mel, hparams)
\ No newline at end of file
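tf.compat.v1.reset_default_graph() drops every node in the global default graph, so a fresh Tacotron2 can be built without name collisions from a previously loaded model. A quick sketch of the effect:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()
tf.compat.v1.placeholder(tf.float32, name="stale")  # populate the graph
tf.compat.v1.reset_default_graph()                  # start over with a clean graph
g = tf.compat.v1.get_default_graph()
assert len(g.get_operations()) == 0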
......@@ -60,10 +60,10 @@ def _location_sensitive_score(W_query, W_fil, W_keys):
dtype = W_query.dtype
num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]
v_a = tf.get_variable(
v_a = tf.compat.v1.get_variable(
"attention_variable_projection", shape=[num_units], dtype=dtype,
initializer=tf.contrib.layers.xavier_initializer())
b_a = tf.get_variable(
b_a = tf.compat.v1.get_variable(
"attention_bias", shape=[num_units], dtype=dtype,
initializer=tf.zeros_initializer())
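One caveat the migration leaves unfinished here: tf.contrib was removed entirely in TensorFlow 2.x, so tf.contrib.layers.xavier_initializer only resolves on the pinned 1.15. The equivalent initializer that survives the migration (an assumption about intent, not part of this commit):

import tensorflow as tf

# Xavier/Glorot uniform, the same distribution tf.contrib.layers.xavier_initializer gave:
initializer = tf.compat.v1.glorot_uniform_initializer()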
......@@ -155,10 +155,10 @@ class LocationSensitiveAttention(BahdanauAttention):
probability_fn=normalization_function,
name=name)
self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters,
self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters,
kernel_size=hparams.attention_kernel, padding="same", use_bias=True,
bias_initializer=tf.zeros_initializer(), name="location_features_convolution")
self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,
self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False,
dtype=tf.float32, name="location_features_layer")
self._cumulate = cumulate_weights
......
......@@ -119,7 +119,7 @@ class TacoTrainingHelper(Helper):
#Pick previous outputs randomly with respect to teacher forcing ratio
next_inputs = tf.cond(
tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
lambda: self._targets[:, time, :], #Teacher-forcing: return true frame
lambda: outputs[:,-self._output_dim:])
......
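The rename tf.random_uniform to tf.random.uniform is purely cosmetic; the coin flip it feeds implements scheduled teacher forcing. A standalone sketch of that decision (placeholder tensors in place of the real frames):

import tensorflow as tf

ratio = tf.constant(0.5)                   # teacher-forcing probability (placeholder)
true_frame = tf.ones([80])                 # stand-in for the ground-truth mel frame
model_frame = tf.zeros([80])               # stand-in for the decoder's last output
next_input = tf.cond(tf.less(tf.random.uniform([], 0.0, 1.0), ratio),
                     lambda: true_frame,   # teacher forcing
                     lambda: model_frame)  # free running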
import tensorflow as tf
import torch
class HighwayNet:
......@@ -6,12 +7,12 @@ class HighwayNet:
self.units = units
self.scope = "HighwayNet" if name is None else name
self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T",
self.H_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.relu, name="H")
self.T_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T",
bias_initializer=tf.constant_initializer(-1.))
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
H = self.H_layer(inputs)
T = self.T_layer(inputs)
return H * T + inputs * (1. - T)
......@@ -38,8 +39,8 @@ class CBHG:
self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope))
def __call__(self, inputs, input_lengths):
with tf.variable_scope(self.scope):
with tf.variable_scope("conv_bank"):
with tf.compat.v1.variable_scope(self.scope):
with tf.compat.v1.variable_scope("conv_bank"):
# Convolution bank: concatenate on the last axis to stack channels from all
# convolutions
# The convolution bank uses multiple different kernel sizes to have many insights
......@@ -71,7 +72,7 @@ class CBHG:
# Additional projection in case of dimension mismatch (for HighwayNet "residual"
# connection)
if highway_input.shape[2] != self.highway_units:
highway_input = tf.layers.dense(highway_input, self.highway_units)
highway_input = tf.compat.v1.layers.Dense(highway_input, self.highway_units)
# 4-layer HighwayNet
for highwaynet in self.highwaynet_layers:
......@@ -88,7 +89,7 @@ class CBHG:
return tf.concat(outputs, axis=2) # Concat forward and backward outputs
class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
class ZoneoutLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell):
"""Wrapper for tf LSTM to create Zoneout LSTM Cell
inspired by:
......@@ -108,8 +109,11 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
if zm < 0. or zs > 1.:
raise ValueError("One/both provided Zoneout factors are not in [0, 1]")
self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
if torch.cuda.is_available():
self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(num_units, name=name)
else:
self._cell = tf.contrib.rnn.LSTMBlockCell(num_units, name=name)
self._zoneout_cell = zoneout_factor_cell
self._zoneout_outputs = zoneout_factor_output
self.is_training = is_training
......@@ -144,16 +148,13 @@ class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
if self.is_training:
# nn.dropout takes keep_prob (probability to keep activations) not drop_prob (
# probability to mask activations)!
c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c,
(1 - self._zoneout_cell)) + prev_c
h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h,
(1 - self._zoneout_outputs)) + prev_h
c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c
h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h
else:
c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
new_state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c,
h])
return output, new_state
......@@ -175,7 +176,7 @@ class EncoderConvolutions:
"""
super(EncoderConvolutions, self).__init__()
self.is_training = is_training
self.kernel_size = hparams.enc_conv_kernel_size
self.channels = hparams.enc_conv_channels
self.activation = activation
......@@ -184,7 +185,7 @@ class EncoderConvolutions:
self.enc_conv_num_layers = hparams.enc_conv_num_layers
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
x = inputs
for i in range(self.enc_conv_num_layers):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
......@@ -226,8 +227,8 @@ class EncoderRNN:
name="encoder_bw_LSTM")
def __call__(self, inputs, input_lengths):
with tf.variable_scope(self.scope):
outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
with tf.compat.v1.variable_scope(self.scope):
outputs, (fw_state, bw_state) = tf.compat.v1.nn.bidirectional_dynamic_rnn(
self._fw_cell,
self._bw_cell,
inputs,
......@@ -239,7 +240,8 @@ class EncoderRNN:
class Prenet:
"""Two fully connected layers used as an information bottleneck for the attention.
"""
Two fully connected layers used as an information bottleneck for the attention.
"""
def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu,
......@@ -263,13 +265,13 @@ class Prenet:
def __call__(self, inputs):
x = inputs
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
for i, size in enumerate(self.layers_sizes):
dense = tf.layers.dense(x, units=size, activation=self.activation,
dense = tf.compat.v1.layers.dense(x, units=size, activation=self.activation,
name="dense_{}".format(i + 1))
# The paper discussed introducing diversity in generation at inference time
# by using a dropout of 0.5 only in prenet layers (in both training and inference).
x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
x = tf.compat.v1.layers.dropout(dense, rate=self.drop_rate, training=True,
name="dropout_{}".format(i + 1) + self.scope)
return x
......@@ -302,10 +304,10 @@ class DecoderRNN:
name="decoder_LSTM_{}".format(i + 1)) for i in
range(layers)]
self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
self._cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
def __call__(self, inputs, states):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
return self._cell(inputs, states)
......@@ -327,14 +329,14 @@ class FrameProjection:
self.activation = activation
self.scope = "Linear_projection" if scope is None else scope
self.dense = tf.layers.Dense(units=shape, activation=activation,
self.dense = tf.compat.v1.layers.Dense(units=shape, activation=activation,
name="projection_{}".format(self.scope))
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
# If activation==None, this returns a simple Linear projection
# else the projection will be passed through an activation function
# output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
# output = tf.compat.v1.layers.Dense(inputs, units=self.shape, activation=self.activation,
# name="projection_{}".format(self.scope))
output = self.dense(inputs)
......@@ -362,7 +364,7 @@ class StopProjection:
self.scope = "stop_token_projection" if scope is None else scope
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
output = tf.layers.dense(inputs, units=self.shape,
activation=None, name="projection_{}".format(self.scope))
......@@ -399,7 +401,7 @@ class Postnet:
self.drop_rate = hparams.tacotron_dropout_rate
def __call__(self, inputs):
with tf.variable_scope(self.scope):
with tf.compat.v1.variable_scope(self.scope):
x = inputs
for i in range(self.postnet_num_layers - 1):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
......@@ -412,16 +414,16 @@ class Postnet:
def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope):
with tf.variable_scope(scope):
conv1d_output = tf.layers.conv1d(
with tf.compat.v1.variable_scope(scope):
conv1d_output = tf.compat.v1.layers.conv1d(
inputs,
filters=channels,
kernel_size=kernel_size,
activation=None,
padding="same")
batched = tf.layers.batch_normalization(conv1d_output, training=is_training)
batched = tf.compat.v1.layers.batch_normalization(conv1d_output, training=is_training)
activated = activation(batched)
return tf.layers.dropout(activated, rate=drop_rate, training=is_training,
return tf.compat.v1.layers.dropout(activated, rate=drop_rate, training=is_training,
name="dropout_{}".format(scope))
......
......@@ -83,11 +83,11 @@ class Tacotron():
##############
p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
lout_float) if mel_targets is not None else mel_targets
p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
lout_float) if stop_token_targets is not None else \
p_inputs = tf.numpy_function(split_func, [inputs, split_infos[:, 0]], lout_int)
p_mel_targets = tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]],
lout_float) if mel_targets is not None else mel_targets
p_stop_token_targets = tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]],
lout_float) if stop_token_targets is not None else \
stop_token_targets
tower_inputs = []
......@@ -120,9 +120,9 @@ class Tacotron():
gpus = ["/gpu:{}".format(i) for i in
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
for i in range(hp.tacotron_num_gpus):
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
with tf.variable_scope("inference") as scope:
with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
with tf.compat.v1.variable_scope("inference") as scope:
assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
assert global_step is not None
......@@ -132,7 +132,7 @@ class Tacotron():
post_condition = hp.predict_linear and not gta
# Embeddings ==> [batch_size, sequence_length, embedding_dim]
self.embedding_table = tf.get_variable(
self.embedding_table = tf.compat.v1.get_variable(
"inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])
......@@ -283,7 +283,7 @@ class Tacotron():
self.tower_targets_lengths = tower_targets_lengths
self.tower_stop_token_targets = tower_stop_token_targets
self.all_vars = tf.trainable_variables()
self.all_vars = tf.compat.v1.trainable_variables()
log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
log(" Train mode: {}".format(is_training))
......@@ -331,9 +331,9 @@ class Tacotron():
range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
for i in range(hp.tacotron_num_gpus):
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
with tf.variable_scope("loss") as scope:
with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
with tf.compat.v1.variable_scope("loss") as scope:
if hp.mask_decoder:
# Compute loss of predictions before postnet
before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i],
......@@ -356,11 +356,11 @@ class Tacotron():
linear_loss = 0.
else:
# Compute loss of predictions before postnet
before = tf.losses.mean_squared_error(self.tower_mel_targets[i],
self.tower_decoder_output[i])
before = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
self.tower_decoder_output[i])
# Compute loss after postnet
after = tf.losses.mean_squared_error(self.tower_mel_targets[i],
self.tower_mel_outputs[i])
after = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i],
self.tower_mel_outputs[i])
# Compute <stop_token> loss (for learning dynamic generation stop)
stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
labels=self.tower_stop_token_targets[i],
......@@ -439,7 +439,7 @@ class Tacotron():
grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0]
with tf.device(grad_device):
with tf.variable_scope("optimizer") as scope:
with tf.compat.v1.variable_scope("optimizer") as scope:
if hp.tacotron_decay_learning_rate:
self.decay_steps = hp.tacotron_decay_steps
self.decay_rate = hp.tacotron_decay_rate
......@@ -448,16 +448,16 @@ class Tacotron():
else:
self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)
optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)
# 2. Compute Gradient
for i in range(hp.tacotron_num_gpus):
# Device placement
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
worker_device=gpus[i])):
# agg_loss += self.tower_loss[i]
with tf.variable_scope("optimizer") as scope:
with tf.compat.v1.variable_scope("optimizer") as scope:
gradients = optimizer.compute_gradients(self.tower_loss[i])
tower_gradients.append(gradients)
......@@ -490,7 +490,7 @@ class Tacotron():
# Add dependency on UPDATE_OPS; otherwise batchnorm won"t work correctly. See:
# https://github.com/tensorflow/tensorflow/issues/1122
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
with tf.control_dependencies(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)):
self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars),
global_step=global_step)
......@@ -510,12 +510,12 @@ class Tacotron():
hp = self._hparams
# Compute natural exponential decay
lr = tf.train.exponential_decay(init_lr,
global_step - hp.tacotron_start_decay,
# lr = 1e-3 at step 50k
self.decay_steps,
self.decay_rate, # lr = 1e-5 around step 310k
name="lr_exponential_decay")
lr = tf.compat.v1.train.exponential_decay(init_lr,
global_step - hp.tacotron_start_decay,
# lr = 1e-3 at step 50k
self.decay_steps,
self.decay_rate, # lr = 1e-5 around step 310k
name="lr_exponential_decay")
# clip learning rate by max and min values (initial and final values)
return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)
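The decay schedule, with the clipping applied on the last line, keeps the learning rate inside [final, initial]. A runnable sketch with stand-in values (the real ones live in hparams.py):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()
init_lr, final_lr = 1e-3, 1e-5                       # placeholder hparam values
lr = tf.compat.v1.train.exponential_decay(
    init_lr, global_step=260000 - 50000,             # steps past tacotron_start_decay
    decay_steps=50000, decay_rate=0.5, name="lr_exponential_decay")
lr = tf.minimum(tf.maximum(lr, final_lr), init_lr)   # clip to [final_lr, init_lr]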
......@@ -12,13 +12,13 @@ class Tacotron2:
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
log("Constructing model: %s" % model_name)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths")
speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs")
input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths")
speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
name="speaker_embeddings")
targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
with tf.variable_scope("Tacotron_model") as scope:
targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets")
split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos")
with tf.compat.v1.variable_scope("Tacotron_model") as scope:
self.model = create_model(model_name, hparams)
if gta:
self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta,
......@@ -52,14 +52,14 @@ class Tacotron2:
log("Loading checkpoint: %s" % checkpoint_path)
#Memory allocation on the GPUs as needed
config = tf.ConfigProto()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
self.session = tf.Session(config=config)
self.session.run(tf.global_variables_initializer())
self.session = tf.compat.v1.Session(config=config)
self.session.run(tf.compat.v1.global_variables_initializer())
saver = tf.train.Saver()
saver = tf.compat.v1.train.Saver()
saver.restore(self.session, checkpoint_path)
def my_synthesize(self, speaker_embeds, texts):
......
......@@ -33,48 +33,48 @@ def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoi
def add_train_stats(model, hparams):
with tf.variable_scope("stats") as scope:
with tf.compat.v1.variable_scope("stats") as scope:
for i in range(hparams.tacotron_num_gpus):
tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i])
tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i])
tf.summary.scalar("before_loss", model.before_loss)
tf.summary.scalar("after_loss", model.after_loss)
tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i])
tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i])
tf.compat.v1.summary.scalar("before_loss", model.before_loss)
tf.compat.v1.summary.scalar("after_loss", model.after_loss)
if hparams.predict_linear:
tf.summary.scalar("linear_loss", model.linear_loss)
tf.compat.v1.summary.scalar("linear_loss", model.linear_loss)
for i in range(hparams.tacotron_num_gpus):
tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i])
tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i])
tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i])
tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i])
tf.summary.scalar("regularization_loss", model.regularization_loss)
tf.summary.scalar("stop_token_loss", model.stop_token_loss)
tf.summary.scalar("loss", model.loss)
tf.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed
tf.compat.v1.summary.scalar("regularization_loss", model.regularization_loss)
tf.compat.v1.summary.scalar("stop_token_loss", model.stop_token_loss)
tf.compat.v1.summary.scalar("loss", model.loss)
tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed
if hparams.tacotron_teacher_forcing_mode == "scheduled":
tf.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing
tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing
# ratio decay when mode = "scheduled"
gradient_norms = [tf.norm(grad) for grad in model.gradients]
tf.summary.histogram("gradient_norm", gradient_norms)
tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize
gradient_norms = [tf.norm(tensor=grad) for grad in model.gradients]
tf.compat.v1.summary.histogram("gradient_norm", gradient_norms)
tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(input_tensor=gradient_norms)) # visualize
# gradients (in case of explosion)
return tf.summary.merge_all()
return tf.compat.v1.summary.merge_all()
def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss,
loss):
values = [
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss",
simple_value=before_loss),
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss",
simple_value=after_loss),
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss",
simple_value=stop_token_loss),
tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss),
tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss",
simple_value=before_loss),
tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss",
simple_value=after_loss),
tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss",
simple_value=stop_token_loss),
tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss),
]
if linear_loss is not None:
values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
simple_value=linear_loss))
test_summary = tf.Summary(value=values)
values.append(tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss",
simple_value=linear_loss))
test_summary = tf.compat.v1.Summary(value=values)
summary_writer.add_summary(test_summary, step)
......@@ -83,7 +83,7 @@ def time_string():
def model_train_mode(args, feeder, hparams, global_step):
with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
model = create_model("Tacotron", hparams)
model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings,
feeder.mel_targets, feeder.token_targets,
......@@ -96,7 +96,7 @@ def model_train_mode(args, feeder, hparams, global_step):
def model_test_mode(args, feeder, hparams, global_step):
with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope:
with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope:
model = create_model("Tacotron", hparams)
model.initialize(feeder.eval_inputs, feeder.eval_input_lengths,
feeder.eval_speaker_embeddings, feeder.eval_mel_targets,
......@@ -136,11 +136,11 @@ def train(log_dir, args, hparams):
log(hparams_debug_string())
# Start by setting a seed for repeatability
tf.set_random_seed(hparams.tacotron_random_seed)
tf.compat.v1.set_random_seed(hparams.tacotron_random_seed)
# Set up data feeder
coord = tf.train.Coordinator()
with tf.variable_scope("datafeeder") as scope:
with tf.compat.v1.variable_scope("datafeeder") as scope:
feeder = Feeder(coord, metadat_fpath, hparams)
# Set up model:
......@@ -164,21 +164,21 @@ def train(log_dir, args, hparams):
step = 0
time_window = ValueWindow(100)
loss_window = ValueWindow(100)
saver = tf.train.Saver(max_to_keep=5)
saver = tf.compat.v1.train.Saver(max_to_keep=5)
log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps))
# Memory allocation on the GPU as needed
config = tf.ConfigProto()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
# Train
with tf.Session(config=config) as sess:
with tf.compat.v1.Session(config=config) as sess:
try:
summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
summary_writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph)
sess.run(tf.global_variables_initializer())
sess.run(tf.compat.v1.global_variables_initializer())
# saved model restoring
if args.restore:
......
......@@ -6,7 +6,7 @@ import torch
_model = None # type: WaveRNN
def load_model(weights_fpath, verbose=True):
global _model
global _model, _device
if verbose:
print("Building Wave-RNN")
......@@ -23,11 +23,17 @@ def load_model(weights_fpath, verbose=True):
hop_length=hp.hop_length,
sample_rate=hp.sample_rate,
mode=hp.voc_mode
).cuda()
)
if torch.cuda.is_available():
_model = _model.cuda()
_device = torch.device('cuda')
else:
_device = torch.device('cpu')
if verbose:
print("Loading model weights at %s" % weights_fpath)
checkpoint = torch.load(weights_fpath)
checkpoint = torch.load(weights_fpath, _device)
_model.load_state_dict(checkpoint['model_state'])
_model.eval()
......
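The cuda()/cpu() branch added here, like the matching torch.load change, can be written once with .to(device); a condensed sketch of the same pattern (assuming a vocoder.pt checkpoint on disk):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.nn.Linear(4, 4).to(device)             # stand-in for the WaveRNN model
checkpoint = torch.load("vocoder.pt", map_location=device)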
......@@ -157,7 +157,10 @@ class WaveRNN(nn.Module):
rnn2 = self.get_gru_cell(self.rnn2)
with torch.no_grad():
mels = mels.cuda()
if torch.cuda.is_available():
mels = mels.cuda()
else:
mels = mels.cpu()
wave_len = (mels.size(-1) - 1) * self.hop_length
mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
mels, aux = self.upsample(mels.transpose(1, 2))
......@@ -168,9 +171,14 @@ class WaveRNN(nn.Module):
b_size, seq_len, _ = mels.size()
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
x = torch.zeros(b_size, 1).cuda()
if torch.cuda.is_available():
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
x = torch.zeros(b_size, 1).cuda()
else:
h1 = torch.zeros(b_size, self.rnn_dims).cpu()
h2 = torch.zeros(b_size, self.rnn_dims).cpu()
x = torch.zeros(b_size, 1).cpu()
d = self.aux_dims
aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
......@@ -260,7 +268,10 @@ class WaveRNN(nn.Module):
# i.e., it won't generalise to other shapes/dims
b, t, c = x.size()
total = t + 2 * pad if side == 'both' else t + pad
padded = torch.zeros(b, total, c).cuda()
if torch.cuda.is_available():
padded = torch.zeros(b, total, c).cuda()
else:
padded = torch.zeros(b, total, c).cpu()
if side == 'before' or side == 'both':
padded[:, pad:pad + t, :] = x
elif side == 'after':
......@@ -306,7 +317,10 @@ class WaveRNN(nn.Module):
padding = target + 2 * overlap - remaining
x = self.pad_tensor(x, padding, side='after')
folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
if torch.cuda.is_available():
folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
else:
folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()
# Get the values for the folded tensor
for i in range(num_folds):
......
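Each of the cuda()/cpu() pairs added in this file first allocates on the default device and then moves the tensor; creating it on the target device directly avoids both the branch and the copy. A sketch of the equivalent (a hypothetical refactor, not part of this commit):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
h1 = torch.zeros(8, 512, device=device)  # replaces torch.zeros(...).cuda() / .cpu()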