Commit 9a2f1ddd authored by: Corentin Jemine

Moved tacotron2 under SV2TTS

Parent 6190c6e9
......@@ -14,5 +14,5 @@
*.bcf
_old
SV2TTS/encoder/saved_models/*_backups
tacotron2/logs-*
SV2TTS/synthesizer/saved_models/*
wave-rnn/model_outputs
\ No newline at end of file
MIT License
Original work Copyright (c) 2018 Rayhane Mama
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
from time import perf_counter
from encoder import inference
from config import *
from encoder.params_data import sampling_rate
from pathlib import Path
import torch
if __name__ == '__main__':
fpath = r"E:\Datasets\LibriSpeech\train-other-500\149\125760\149-125760-0003.flac"
wave = inference.load_and_preprocess_wave(fpath)
wav = inference.load_preprocess_waveform(fpath)
models_dir = Path("encoder/saved_models")
model_fpath = models_dir.joinpath("all.pt")
torch.cuda.synchronize()
model_fpath = fileio.join(model_dir, "all.pt")
start = perf_counter()
inference.load_model(model_fpath, device)
inference.load_model(model_fpath)
print("Loaded model in %.2fs" % (perf_counter() - start))
torch.cuda.synchronize()
duration = len(wave) / sampling_rate
duration = len(wav) / sampling_rate
start = perf_counter()
embed = inference.embed_utterance(wave)
embed = inference.embed_utterance(wav)
torch.cuda.synchronize()
print("Processed %.2fs long utterance in %.2fs" % (duration, perf_counter() - start))
start = perf_counter()
embed = inference.embed_utterance(wave)
embed = inference.embed_utterance(wav)
torch.cuda.synchronize()
print("Processed %.2fs long utterance in %.2fs" % (duration, perf_counter() - start))
print(embed)
from datasets.audio import inv_mel_spectrogram
from tacotron import synthesizer
from hparams import hparams
from synthesizer.hparams import hparams
from vlibs import fileio
import sounddevice as sd
import tensorflow as tf
......
......@@ -211,7 +211,7 @@ class UMapDemoUI(QDialog):
self.dataset_box.currentText(),
self.speaker_box.currentText(),
self.utterance_box.currentText())
self.utterance = inference.load_and_preprocess_wave(fpath)
self.utterance = inference.load_preprocess_waveform(fpath)
self.is_record = False
def embed_utterance(self, demo, speaker_name=None, go_next=None):
......
......@@ -9,17 +9,20 @@ from matplotlib import cm
_model = None # type: SpeakerEncoder
_device = None # type: torch.device
def load_model(weights_fpath, device):
def load_model(weights_fpath, device=None):
"""
Loads the model in memory. If this function is not explicitly called, it will be run on the
first call to embed_frames() with the default weights file.
:param weights_fpath: the path to saved model weights.
:param device: either a torch device or the name of a torch device (e.g. 'cpu', 'cuda'). The
model will be loaded and will run on this device. Outputs will however always be on the cpu.
model will be loaded and will run on this device. Outputs will however always be on the cpu.
If None, will default to your GPU if it's available, otherwise your CPU.
"""
global _model, _device
if isinstance(device, str):
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
elif isinstance(device, str):
device = torch.device(device)
_device = device
_model = SpeakerEncoder(_device)
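# Usage sketch (paths are illustrative, not part of this commit): with the new default,
# the device argument can now be omitted entirely:
#   inference.load_model(Path("encoder/saved_models/all.pt"))          # picks CUDA if available, else CPU
#   inference.load_model(Path("encoder/saved_models/all.pt"), "cpu")   # or force a specific device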
......@@ -76,28 +79,28 @@ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_fram
frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
# Compute the slices
wave_slices, mel_slices = [], []
wav_slices, mel_slices = [], []
steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
for i in range(0, steps, frame_step):
mel_range = np.array([i, i + partial_utterance_n_frames])
wave_range = mel_range * samples_per_frame
wav_range = mel_range * samples_per_frame
mel_slices.append(slice(*mel_range))
wave_slices.append(slice(*wave_range))
wav_slices.append(slice(*wav_range))
# Evaluate whether extra padding is warranted or not
last_wave_range = wave_slices[-1]
coverage = (n_samples - last_wave_range.start) / (last_wave_range.stop - last_wave_range.start)
last_wav_range = wav_slices[-1]
coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
if coverage < min_pad_coverage and len(mel_slices) > 1:
mel_slices = mel_slices[:-1]
wave_slices = wave_slices[:-1]
wav_slices = wav_slices[:-1]
return wave_slices, mel_slices
return wav_slices, mel_slices
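# Worked example (parameter values assumed for illustration): with partial_utterance_n_frames=160
# and overlap=0.5, frame_step = max(int(round(160 * (1 - 0.5))), 1) = 80, so a new partial starts
# every 80 mel frames and each wav slice spans 160 * samples_per_frame audio samples. If the last
# slice covers less than min_pad_coverage of real audio, it is dropped by the coverage check above.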
def embed_utterance(wave, using_partials=True, return_partials=False, **kwargs):
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
"""
Computes an embedding for a single utterance.
:param wave: the utterance waveform as a numpy array of float32
:param wav: the utterance waveform as a numpy array of float32
:param using_partials: if True, then the utterance is split in partial utterances of
<partial_utterance_n_frames> frames and the utterance embedding is computed from their
normalized average. If False, the utterance is instead computed from feeding the entire
......@@ -113,20 +116,20 @@ def embed_utterance(wave, using_partials=True, return_partials=False, **kwargs):
"""
# Process the entire utterance if not using partials
if not using_partials:
frames = audio.wave_to_mel_filterbank(wave)
frames = audio.wave_to_mel_filterbank(wav)
embed = embed_frames_batch(frames[None, ...])[0]
if return_partials:
return embed, None, None
return embed
# Compute where to split the utterance into partials and pad if necessary
wave_slices, mel_slices = compute_partial_slices(len(wave), **kwargs)
wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
max_wave_length = wave_slices[-1].stop
if max_wave_length >= len(wave):
wave = np.pad(wave, (0, max_wave_length - len(wave)), 'constant')
if max_wave_length >= len(wav):
wav = np.pad(wav, (0, max_wave_length - len(wav)), 'constant')
# Split the utterance into partials
frames = audio.wave_to_mel_filterbank(wave)
frames = audio.wave_to_mel_filterbank(wav)
frames_batch = np.array([frames[s] for s in mel_slices])
partial_embeds = embed_frames_batch(frames_batch)
......@@ -137,14 +140,11 @@ def embed_utterance(wave, using_partials=True, return_partials=False, **kwargs):
if return_partials:
return embed, partial_embeds, wave_slices
return embed
def embed_stream(stream, partial_utterance_n_frames=partials_n_frames, overlap=0.5):
raise NotImplementedError()
def embed_speaker(waves, normalize=False, **kwargs):
def embed_speaker(wavs, normalize=False, **kwargs):
raise NotImplementedError()
def load_and_preprocess_wave(fpath):
def load_preprocess_waveform(fpath):
"""
Loads an audio file in memory and applies the same preprocessing operations used in training
the Speaker Encoder. Using this function is not mandatory but recommended.
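# Minimal usage sketch (file path is an assumption):
#   wav = load_preprocess_waveform("path/to/utterance.flac")
#   embed = embed_utterance(wav)   # normalized average of the partial embeddings (see docstring above)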
......@@ -178,16 +178,16 @@ if __name__ == '__main__':
fig, axes = plt.subplots(3, 3)
for i, ax in enumerate(axes.flatten(), 50):
fpath = r"E:\Datasets\LJSpeech-1.1\wavs\LJ001-%04d.wav" % (i + 1)
wave = load_and_preprocess_wave(fpath)
embed = embed_utterance(wave)
wav = load_preprocess_waveform(fpath)
embed = embed_utterance(wav)
plot_embedding_as_heatmap(embed, ax)
plt.show(block=False)
fig, axes = plt.subplots(3, 3)
for i, ax in enumerate(axes.flatten(), 20):
fpath = r"E:\Datasets\LibriSpeech\train-other-500\25\123319\25-123319-%04d.flac" % i
wave = load_and_preprocess_wave(fpath)
embed = embed_utterance(wave)
wav = load_preprocess_waveform(fpath)
embed = embed_utterance(wav)
plot_embedding_as_heatmap(embed, ax)
plt.show()
......
......@@ -40,6 +40,7 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, vis_every: int,
print('No model \"%s\" found, starting training from scratch.' % run_id)
else:
print("Starting the training from scratch.")
model.train()
# Initialize the visualization environment
device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU')
......
webrtcvad
falcon>=1.2.0
inflect>=0.2.5
audioread>=2.1.5
librosa>=0.5.1
matplotlib>=2.0.2
numpy>=1.14.0
scipy>=1.0.0
tqdm>=4.11.2
Unidecode>=0.4.20
pyaudio>=0.2.11
sounddevice>=0.3.10
\ No newline at end of file
import librosa
import librosa.filters
import numpy as np
import tensorflow as tf
from scipy import signal
from scipy.io import wavfile
def load_wav(path, sr):
return librosa.core.load(path, sr=sr)[0]
def save_wav(wav, path, sr):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
#proposed by @dsmiller
wavfile.write(path, sr, wav.astype(np.int16))
def save_wavenet_wav(wav, path, sr):
librosa.output.write_wav(path, wav, sr=sr)
def preemphasis(wav, k, preemphasize=True):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
def inv_preemphasis(wav, k, inv_preemphasize=True):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break
assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold
return start, end
def get_hop_size(hparams):
hop_size = hparams.hop_size
if hop_size is None:
assert hparams.frame_shift_ms is not None
hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
return hop_size
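# Worked example (values taken from the hparams comments): if hop_size is None and
# frame_shift_ms=12.5 with sample_rate=16000, then hop_size = int(12.5 / 1000 * 16000) = 200 samples.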
def linearspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def melspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def inv_linear_spectrogram(linear_spectrogram, hparams):
'''Converts linear spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(linear_spectrogram, hparams)
else:
D = linear_spectrogram
S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def inv_mel_spectrogram(mel_spectrogram, hparams):
'''Converts mel spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(mel_spectrogram, hparams)
else:
D = mel_spectrogram
S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def _lws_processor(hparams):
import lws
return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
def _griffin_lim(S, hparams):
'''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
'''
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))
y = _istft(S_complex * angles, hparams)
return y
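# Note on the loop above: starting from random phases, each of hparams.griffin_lim_iters iterations
# re-estimates the phase from the STFT of the current waveform while keeping the known magnitudes
# |S|, which is the standard Griffin-Lim fixed-point update.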
def _stft(y, hparams):
if hparams.use_lws:
return _lws_processor(hparams).stft(y).T
else:
return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
def _istft(y, hparams):
return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M
def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
'''compute right padding (final frame)
'''
return int(fsize // 2)
# Conversions
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, hparams):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis(hparams)
return np.dot(_mel_basis, spectogram)
def _mel_to_linear(mel_spectrogram, hparams):
global _inv_mel_basis
if _inv_mel_basis is None:
_inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def _build_mel_basis(hparams):
assert hparams.fmax <= hparams.sample_rate // 2
return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
fmin=hparams.fmin, fmax=hparams.fmax)
def _amp_to_db(x, hparams):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, (x) * 0.05)
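# Round-trip check (illustrative): _amp_to_db maps an amplitude a to 20 * log10(max(min_level, a))
# and _db_to_amp maps x back to 10 ** (x / 20), so _db_to_amp(_amp_to_db(a, hparams)) == max(min_level, a).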
def _normalize(S, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-hparams.max_abs_value, hparams.max_abs_value)
else:
return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
if hparams.symmetric_mels:
return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
else:
return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
def _denormalize(D, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return (((np.clip(D, -hparams.max_abs_value,
hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
+ hparams.min_level_db)
else:
return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
if hparams.symmetric_mels:
return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
else:
return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
import os
import logmmse
from vlibs import fileio
import numpy as np
from datasets import audio
......@@ -11,7 +10,7 @@ from encoder import inference
def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
"""
Preprocesses the speech dataset from a gven input path to given output directories
Preprocesses the speech dataset from a given input path to given output directories
Args:
- hparams: hyper parameters
......@@ -32,23 +31,17 @@ def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
for input_dir in input_dirs:
for speaker_dir in fileio.listdir(input_dir, full_path=True):
print(" " + speaker_dir)
for utterance_dir in fileio.listdir(speaker_dir, full_path=True):
alignment_file = fileio.get_files(utterance_dir, '.alignment.txt')[0]
for line in fileio.read_all_lines(alignment_file):
# Retrieve the audio filepath and its alignment data
basename, words, end_times = line.strip().split(' ')
words = words.replace('\"', '').split(',')
end_times = [float(e) for e in end_times.replace('\"', '').split(',')]
wav_path = fileio.join(utterance_dir, basename + '.flac')
# Split utterances on silences
wavs, texts = _clean_and_split_utterance(wav_path, words, end_times, hparams)
# Process all parts of the utterance
for i, (wav, text) in enumerate(zip(wavs, texts)):
sub_basename = "%s_%02d" % (basename, i)
data.append(_process_utterance(mel_dir, embed_dir, wav_dir, sub_basename,
wav, text, hparams))
for book_dir in fileio.listdir(speaker_dir, full_path=True):
text_fpaths = fileio.get_files(book_dir, '\.normalized\.txt')
wav_fpaths = fileio.get_files(book_dir, '\.wav')
assert len(text_fpaths) == len(wav_fpaths)
for text_fpath, wav_fpath in zip(text_fpaths, wav_fpaths):
basename = os.path.splitext(fileio.leaf(wav_fpath))[0]
text = fileio.read_all_lines(text_fpath)[0].rstrip()
text = text.lower()
data.append(_process_utterance(mel_dir, embed_dir, wav_dir, basename,
wav_fpath, text, hparams))
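# Expected on-disk layout for the loop above (filenames illustrative, LibriTTS-style):
#   <input_dir>/<speaker>/<book>/84_121123_000007_000001.wav
#   <input_dir>/<speaker>/<book>/84_121123_000007_000001.normalized.txt
# Each .normalized.txt carries the transcript on its first line, which is lowercased before use.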
n_all_samples = len(data)
data = [d for d in data if d is not None]
......@@ -57,34 +50,7 @@ def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
(n_all_samples, n_all_samples - n_remaining_samples, n_remaining_samples))
return data
def _clean_and_split_utterance(wav_path, words, end_times, hparams):
# Load and rescale the audio
wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# Suppress the noise
wav = logmmse.logmmse(wav, hparams.sample_rate)
# Find pauses in the sentence
words = np.array(words)
start_times = np.array([0.0] + end_times[:-1])
end_times = np.array(end_times)
assert len(words) == len(end_times) == len(start_times)
assert words[0] == '' and words[-1] == ''
# Break the sentence on pauses that are too long
mask = (words == '') & (end_times - start_times >= hparams.silence_min_duration_split)
mask[0] = mask[-1] = True
breaks = np.where(mask)[0]
segment_times = [[end_times[s], start_times[e]] for s, e in zip(breaks[:-1], breaks[1:])]
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
texts = [' '.join(words[s + 1:e]).replace(' ', ' ') for s, e in zip(breaks[:-1], breaks[1:])]
return wavs, texts
def _process_utterance(mel_dir, embed_dir, wav_dir, basename, wav, text, hparams):
def _process_utterance(mel_dir, embed_dir, wav_dir, basename, wav_path, text, hparams):
"""
Preprocesses a single utterance wav/text pair.
......@@ -95,13 +61,17 @@ def _process_utterance(mel_dir, embed_dir, wav_dir, basename, wav, text, hparams
- embed_dir: the directory to write the embedding into
- wav_dir: the directory to write the preprocessed wav into
- basename: the source base filename to use in the spectrogram filename
- wav: the audio waveform unprocessed
- wav_path: the path to the audio waveform
- text: text spoken in the audio
- hparams: hyper parameters
Returns:
- A tuple: (audio_filename, mel_filename, embed_filename, time_steps, mel_frames, text)
"""
"""
wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
if hparams.rescale:
wav = (wav / np.abs(wav).max()) * hparams.rescaling_max
# Compute the mel scale spectrogram from the wav
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_frames = mel_spectrogram.shape[1]
......
import os
import threading
import time
import traceback
import numpy as np
import tensorflow as tf
from infolog import log
from synthesizer.infolog import log
from sklearn.model_selection import train_test_split
from tacotron.utils.text import text_to_sequence
......
......@@ -93,10 +93,10 @@ hparams = tf.contrib.training.HParams(
# showing black silent regions on top), then restart from step 2.
num_mels=80, # Number of mel-spectrogram channels and local conditioning dimensionality
# num_freq=1025, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing
num_freq=513, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing
num_freq=1025, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing
# network
rescale=False, # Whether to rescale audio prior to preprocessing
rescaling_max=0.999, # Rescaling value
rescale=True, # Whether to rescale audio prior to preprocessing
rescaling_max=0.9, # Rescaling value
# Whether to clip silence in Audio (at beginning and end of audio only, not the middle)
# train samples of lengths between 3sec and 14sec are more than enough to make a model capable
# of good parallelization.
......@@ -123,10 +123,10 @@ hparams = tf.contrib.training.HParams(
# sample_rate=22050, # 22050 Hz (corresponding to ljspeech dataset) (sox --i <filename>)
# FOR DATASETS IN 16000Hz:
n_fft=800, # Extra window size is filled with 0 paddings to match this parameter
hop_size=200, # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
win_size=800, # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
sample_rate=16000, # 16000Hz (corresponding to librispeech) (sox --i <filename>)
n_fft=2048, # Extra window size is filled with 0 paddings to match this parameter
hop_size=300, # For 24000Hz, 300 = 12.5 ms (0.0125 * sample_rate)
win_size=1200, # For 24000Hz, 1200 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
sample_rate=24000, # 24000Hz (corresponding to libritts) (sox --i <filename>)
frame_shift_ms=None, # Can replace hop_size parameter. (Recommended: 12.5)
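# Sanity check on the values above (arithmetic only): at 24000 Hz, hop_size = 0.0125 * 24000 = 300
# (12.5 ms) and win_size = 0.05 * 24000 = 1200 (50 ms), replacing the 16000 Hz values 200 and 800.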
......@@ -181,7 +181,7 @@ hparams = tf.contrib.training.HParams(
# Determines whether the decoder should stop when predicting <stop> to any frame or to all of
# them (True works pretty well)
embedding_dim=512, # dimension of embedding space
embedding_dim=512, # dimension of embedding space (TODO: clarify/delete this)
# Encoder parameters
enc_conv_num_layers=3, # number of encoder convolutional layers
......@@ -252,7 +252,7 @@ hparams = tf.contrib.training.HParams(
# major slowdowns! Only use when critical!)
# train/test split ratios, mini-batches sizes
tacotron_batch_size=40, # number of training samples on each training steps (was 32)
tacotron_batch_size=36, # number of training samples on each training steps (was 32)
# Tacotron Batch synthesis supports ~16x the training batch size (no gradients during
# testing).
# Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times
......
import tensorflow as tf
from tacotron.utils.symbols import symbols
from infolog import log
from synthesizer.infolog import log
from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper
from tacotron.models.modules import *
from tensorflow.contrib.seq2seq import dynamic_decode
from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from tacotron.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from tacotron.models.custom_decoder import CustomDecoder
from tacotron.models.attention import LocationSensitiveAttention
......
import argparse
import os
import re
import time
from time import sleep
import tensorflow as tf
from hparams import hparams, hparams_debug_string
from infolog import log
from synthesizer.hparams import hparams_debug_string
from synthesizer.infolog import log
from tacotron.synthesizer import Synthesizer
from tqdm import tqdm
......
......@@ -5,7 +5,7 @@ import numpy as np
import pyaudio
import tensorflow as tf
from datasets import audio
from infolog import log
from synthesizer.infolog import log
from tacotron.models import create_model
from tacotron.utils import plot
from tacotron.utils.text import text_to_sequence
......
import argparse
import os
import subprocess
import time
import traceback
from datetime import datetime
import infolog
from synthesizer import infolog
import numpy as np
import tensorflow as tf
from datasets import audio
from hparams import hparams_debug_string
from synthesizer.hparams import hparams_debug_string
from tacotron.feeder import Feeder
from tacotron.models import create_model
from tacotron.utils import ValueWindow, plot
......
import argparse
import os
from multiprocessing import cpu_count
from datasets import preprocessor
from synthesizer.hparams import hparams
from vlibs import fileio
def preprocess(args, input_folders, out_dir, hparams):
mel_dir = os.path.join(out_dir, 'mels')
wav_dir = os.path.join(out_dir, 'audio')
embed_dir = os.path.join(out_dir, 'embed')
os.makedirs(mel_dir, exist_ok=True)
os.makedirs(wav_dir, exist_ok=True)
os.makedirs(embed_dir, exist_ok=True)
metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, embed_dir, wav_dir)
write_metadata(metadata, out_dir)
def write_metadata(metadata, out_dir):
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
for m in metadata:
f.write('|'.join([str(x) for x in m]) + '\n')
mel_frames = sum([int(m[4]) for m in metadata])
timesteps = sum([int(m[3]) for m in metadata])
sr = hparams.sample_rate
hours = timesteps / sr / 3600
print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
len(metadata), mel_frames, timesteps, hours))
print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata)))
def norm_data(args):
print('Selecting data folders..')
dataset_dir = fileio.join(args.base_dir, 'LibriTTS')
if args.sets is not None:
sets = args.sets
else:
sets = [set for set in fileio.listdir(dataset_dir) if set.startswith('train-clean')]
return fileio.join(dataset_dir, sets)
def run_preprocess(args, hparams):
input_folders = norm_data(args)
output_folder = os.path.join(args.base_dir, args.output)
preprocess(args, input_folders, output_folder, hparams)
def main():
print('Initializing preprocessing..')
parser = argparse.ArgumentParser()
# Root data directory that contains the LibriTTS directory
parser.add_argument('--base_dir', default='')
parser.add_argument('--hparams', default='',
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
parser.add_argument('--output', default='Synthesizer')
parser.add_argument('--n_jobs', type=int, default=cpu_count())
# Name of the LibriTTS sets to use, separated by spaces
# (e.g. "--sets train-other-500 train-clean-360). Defaults to using all the clean training sets
# present in the LibriSpeech directory.
parser.add_argument('--sets', type=str, nargs='+', default=None)
args = parser.parse_args()
modified_hp = hparams.parse(args.hparams)
run_preprocess(args, modified_hp)
if __name__ == '__main__':
main()
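# Example invocation (base_dir and set names are assumptions, not part of this commit):
#   python preprocess.py --base_dir /data --sets train-clean-100 train-clean-360
# This writes the mels/, audio/ and embed/ folders plus train.txt under /data/Synthesizer.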
import argparse
import os
from time import sleep
import infolog
from synthesizer import infolog
import tensorflow as tf
from hparams import hparams
from infolog import log
from synthesizer.hparams import hparams
from synthesizer.infolog import log
from tacotron.synthesize import tacotron_synthesize
from tacotron.train import tacotron_train
......
import argparse
import os
from warnings import warn
from time import sleep
import tensorflow as tf
from hparams import hparams
from infolog import log
from synthesizer.hparams import hparams
from synthesizer.infolog import log
from tacotron.synthesize import tacotron_synthesize
......
MIT License
Original work Copyright (c) 2018 Rayhane Mama
Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Tacotron-2:
Tensorflow implementation of DeepMind's Tacotron-2. A deep neural network architecture described in this paper: [Natural TTS synthesis by conditioning Wavenet on MEL spectrogram predictions](https://arxiv.org/pdf/1712.05884.pdf)
# Repository Structure:
Tacotron-2
├── datasets
├── en_UK (0)
│   └── by_book
│       └── female
├── en_US (0)
│   └── by_book
│       ├── female
│       └── male
├── LJSpeech-1.1 (0)
│   └── wavs
├── logs-Tacotron (2)
│   ├── eval_-dir
│   │   ├── plots
│   │   └── wavs
│   ├── mel-spectrograms
│   ├── plots
│   ├── pretrained
│   └── wavs
├── logs-Wavenet (4)
│   ├── eval-dir
│   │   ├── plots
│   │   └── wavs
│   ├── plots
│   ├── pretrained
│   └── wavs
├── papers
├── tacotron
│   ├── models
│   └── utils
├── tacotron_output (3)
│   ├── eval
│   ├── gta
│   ├── logs-eval
│   │   ├── plots
│   │   └── wavs
│   └── natural
├── wavenet_output (5)
│   ├── plots
│   └── wavs
├── training_data (1)
│   ├── audio
│   ├── linear
│   └── mels
└── wavenet_vocoder
    └── models
The previous tree shows the current state of the repository (separate training, one step at a time).
- Step **(0)**: Get your dataset; here I have set up examples for **Ljspeech**, **en_US** and **en_UK** (from **M-AILABS**).
- Step **(1)**: Preprocess your data. This will give you the **training_data** folder.
- Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder.
- Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder.
- Step **(4)**: Train your Wavenet model. Yields the **logs-Wavenet** folder.
- Step **(5)**: Synthesize audio using the Wavenet model. Gives the **wavenet_output** folder.
Note:
- **Our preprocessing only supports Ljspeech and Ljspeech-like datasets (M-AILABS speech data)!** If running on datasets stored differently, you will probably need to make your own preprocessing script.
- In the previous tree, files **were not represented** and **max depth was set to 3** for simplicity.
- If you run training of both **models at the same time**, repository structure will be different.
# Pretrained model and Samples:
Pre-trained models and audio samples will be added at a later date. You can, however, check some preliminary insights into the model's performance (at early stages of training) [here](https://github.com/Rayhane-mamah/Tacotron-2/issues/4#issuecomment-378741465). THIS IS VERY OUTDATED, I WILL UPDATE THIS SOON
# Model Architecture:
<p align="center">
<img src="https://preview.ibb.co/bU8sLS/Tacotron_2_Architecture.png"/>
</p>
The model described by the authors can be divided into two parts:
- Spectrogram prediction network
- Wavenet vocoder
To have an in-depth exploration of the model architecture, training procedure and preprocessing logic, refer to [our wiki](https://github.com/Rayhane-mamah/Tacotron-2/wiki)
# Current state:
For an overview of our progress on this project, please refer to [this discussion](https://github.com/Rayhane-mamah/Tacotron-2/issues/4)
Since the two parts of the global model are trained separately, we can start by training the feature prediction model and use its predictions later during the wavenet training.
# How to start
First, you need to have Python 3 installed along with [Tensorflow](https://www.tensorflow.org/install/).
Next, you can install the requirements. If you are an Anaconda user: (otherwise replace **pip** with **pip3** and **python** with **python3**)
> pip install -r requirements.txt
# Dataset:
We tested the code above on the [ljspeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled single-actress voice recordings. (Further info on the dataset is available in the README file that comes with the download.)
We are also running current tests on the [new M-AILABS speech dataset](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) which contains more than 700h of speech (more than 80 Gb of data) for more than 10 languages.
After **downloading** the dataset, **extract** the compressed file, and **place the folder inside the cloned repository.**
# Hparams setting:
Before proceeding, you must pick the hyperparameters that best suit your needs. While it is possible to change the hyperparameters from the command line during preprocessing/training, I still recommend making the changes once and for all on the **hparams.py** file directly.
To pick optimal fft parameters, I have made a **griffin_lim_synthesis_tool** notebook that you can use to invert real extracted mel/linear spectrograms and judge how good your preprocessing is. All other options are well explained in **hparams.py** and have meaningful names so that you can try multiple things with them.
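Hyperparameters can also be overridden on the command line through the **--hparams** argument; the values below are only an illustration:
> python preprocess.py --hparams='rescale=True,rescaling_max=0.9'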
# Preprocessing
Before running the following steps, please make sure you are inside **Tacotron-2 folder**
> cd Tacotron-2
Preprocessing can then be started using:
> python preprocess.py
The dataset can be chosen using the **--dataset** argument. If using the M-AILABS dataset, you need to provide the **language, voice, reader, merge_books and book arguments** for your custom needs. Default is **Ljspeech**.
Example M-AILABS:
> python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=False --book='northandsouth'
or if you want to use all books for a single speaker:
> python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=True
This should take no longer than a **few minutes.**
# Training:
To **train both models** sequentially (one after the other):
> python train.py --model='Tacotron-2'
Feature prediction model can **separately** be **trained** using:
> python train.py --model='Tacotron'
Checkpoints will be made every **5000 steps** and stored under the **logs-Tacotron folder.**
Naturally, **training the wavenet separately** is done by:
> python train.py --model='WaveNet'
logs will be stored inside **logs-Wavenet**.
**Note:**
- If the model argument is not provided, training will default to Tacotron-2 model training (both models).
- Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use.
- It is now possible to do wavenet preprocessing alone using **wavenet_preprocess.py**.
# Synthesis
To **synthesize audio** in an **End-to-End** (text to audio) manner (both models at work):
> python synthesize.py --model='Tacotron-2'
For the spectrogram prediction network (separately), there are **three types** of mel spectrogram synthesis:
- **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end-to-end model.
> python synthesize.py --model='Tacotron' --mode='eval'
- **Natural synthesis** (let the model make predictions alone by feeding last decoder output to the next time step).
> python synthesize.py --model='Tacotron' --GTA=False
- **Ground Truth Aligned synthesis** (DEFAULT: the model is assisted by true labels in a teacher forcing manner). This synthesis method is used when predicting mel spectrograms used to train the wavenet vocoder. (yields better results as stated in the paper)
> python synthesize.py --model='Tacotron' --GTA=True
Synthesizing the **waveforms** conditioned on previously synthesized Mel-spectrograms (separately) can be done with:
> python synthesize.py --model='WaveNet'
**Note:**
- If the model argument is not provided, synthesis will default to Tacotron-2 model synthesis (End-to-End TTS).
- Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use.
# References and Resources:
- [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf)
- [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf)
- [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf)
- [Wavenet: A generative model for raw audio](https://arxiv.org/pdf/1609.03499.pdf)
- [Fast Wavenet](https://arxiv.org/pdf/1611.09482.pdf)
- [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder)
- [keithito/tacotron](https://github.com/keithito/tacotron)
import librosa
import librosa.filters
import numpy as np
import tensorflow as tf
from scipy import signal
from scipy.io import wavfile
def load_wav(path, sr):
return librosa.core.load(path, sr=sr)[0]
def save_wav(wav, path, sr):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
#proposed by @dsmiller
wavfile.write(path, sr, wav.astype(np.int16))
def save_wavenet_wav(wav, path, sr):
librosa.output.write_wav(path, wav, sr=sr)
def preemphasis(wav, k, preemphasize=True):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
def inv_preemphasis(wav, k, inv_preemphasize=True):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break
assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold
return start, end
def get_hop_size(hparams):
hop_size = hparams.hop_size
if hop_size is None:
assert hparams.frame_shift_ms is not None
hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
return hop_size
def linearspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def melspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def inv_linear_spectrogram(linear_spectrogram, hparams):
'''Converts linear spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(linear_spectrogram, hparams)
else:
D = linear_spectrogram
S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def inv_mel_spectrogram(mel_spectrogram, hparams):
'''Converts mel spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(mel_spectrogram, hparams)
else:
D = mel_spectrogram
S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def _lws_processor(hparams):
import lws
return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
def _griffin_lim(S, hparams):
'''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
'''
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))
y = _istft(S_complex * angles, hparams)
return y
def _stft(y, hparams):
if hparams.use_lws:
return _lws_processor(hparams).stft(y).T
else:
return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
def _istft(y, hparams):
return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M
def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
'''compute right padding (final frame)
'''
return int(fsize // 2)
# Conversions
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, hparams):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis(hparams)
return np.dot(_mel_basis, spectogram)
def _mel_to_linear(mel_spectrogram, hparams):
global _inv_mel_basis
if _inv_mel_basis is None:
_inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def _build_mel_basis(hparams):
assert hparams.fmax <= hparams.sample_rate // 2
return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
fmin=hparams.fmin, fmax=hparams.fmax)
def _amp_to_db(x, hparams):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, (x) * 0.05)
def _normalize(S, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-hparams.max_abs_value, hparams.max_abs_value)
else:
return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
if hparams.symmetric_mels:
return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
else:
return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
def _denormalize(D, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return (((np.clip(D, -hparams.max_abs_value,
hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
+ hparams.min_level_db)
else:
return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
if hparams.symmetric_mels:
return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
else:
return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
import numpy as np
from datasets.audio import *
import os
from hparams import hparams
import sounddevice
n_sample = 0
mel_folder = 'logs-Tacotron/mel-spectrograms'
mel_file = 'mel-prediction-step-{}.npy'.format(n_sample)
out_dir = 'wav_out'
os.makedirs(out_dir, exist_ok=True)
#mel_file = os.path.join(mel_folder, mel_file)
from vlibs import fileio
# fnames = fileio.listdir('logs-two_outputs/mel-spectrograms/')
fnames = fileio.listdir('tacotron_output/eval/')
for i in range(1, len(fnames)):
# mel_file = 'logs-two_outputs/mel-spectrograms/mel-prediction-step-110000.npy'
mel_file = fileio.join('tacotron_output/eval/', fnames[i])
mel_spectro = np.load(mel_file) #.transpose()
wav = inv_mel_spectrogram(mel_spectro.T, hparams)
sounddevice.wait()
print(fnames[i])
sounddevice.play(wav, 16000)
sounddevice.wait()
quit()
save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
sr=hparams.sample_rate)
# In[3]:
from tacotron.utils.plot import *
plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))))
# In[4]:
lin_file = 'training_data/linear/linear-LJ001-0005.npy'
lin_spectro = np.load(lin_file)
lin_spectro.shape
# In[5]:
wav = inv_linear_spectrogram(lin_spectro.T, hparams)
save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
sr=hparams.sample_rate)
# In[6]:
plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
auto_aspect=True)
import argparse
import os
from multiprocessing import cpu_count
from datasets import preprocessor
from hparams import hparams
from vlibs import fileio
def preprocess(args, input_folders, out_dir, hparams):
mel_dir = os.path.join(out_dir, 'mels')
wav_dir = os.path.join(out_dir, 'audio')
embed_dir = os.path.join(out_dir, 'embed')
os.makedirs(mel_dir, exist_ok=True)
os.makedirs(wav_dir, exist_ok=True)
os.makedirs(embed_dir, exist_ok=True)
metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, embed_dir, wav_dir)
write_metadata(metadata, out_dir)
def write_metadata(metadata, out_dir):
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
for m in metadata:
f.write('|'.join([str(x) for x in m]) + '\n')
mel_frames = sum([int(m[4]) for m in metadata])
timesteps = sum([int(m[3]) for m in metadata])
sr = hparams.sample_rate
hours = timesteps / sr / 3600
print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
len(metadata), mel_frames, timesteps, hours))
print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata)))
def norm_data(args):
print('Selecting data folders..')
dataset_dir = fileio.join(args.base_dir, 'LibriSpeech')
if args.sets is not None:
sets = args.sets
else:
sets = [set for set in fileio.listdir(dataset_dir) if set.startswith('train-clean')]
return fileio.join(dataset_dir, sets)
def run_preprocess(args, hparams):
input_folders = norm_data(args)
output_folder = os.path.join(args.base_dir, args.output)
preprocess(args, input_folders, output_folder, hparams)
def main():
print('Initializing preprocessing..')
parser = argparse.ArgumentParser()
# Root data directory that contains the LibriSpeech directory
parser.add_argument('--base_dir', default='')
parser.add_argument('--hparams', default='',
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
parser.add_argument('--output', default='Synthesizer')
parser.add_argument('--n_jobs', type=int, default=cpu_count())
# Name of the LibriSpeech sets to use, separated by spaces
# (e.g. "--sets train-other-500 train-clean-360). Defaults to using all the clean training sets
# present in the LibriSpeech directory.
parser.add_argument('--sets', type=str, nargs='+', default=None)
args = parser.parse_args()
modified_hp = hparams.parse(args.hparams)
run_preprocess(args, modified_hp)
if __name__ == '__main__':
main()
from vlibs import fileio
import numpy as np
from hparams import hparams
root = '/home/cjemine/data/Synthesizer2/'
# On the remote, Synthesizer2 has max 900 frames.
# Try with 600, if it works, increase to 700.
lines = fileio.read_all_lines(fileio.join(root, "train.txt"))
out = []
pruned = 0
intact = 0
for line in lines:
line = line.rstrip()
audio_fname, mel_fname, embed_fname, *_ = line.split('|')
mel = np.load(fileio.join(root, "mels", mel_fname))
if len(mel) > hparams.max_mel_frames:
pruned += 1
else:
intact += 1
out.append(line)
if intact % 100 == 0:
print("Kept: %d / Discarded: %d (%.1f%% discarded)" %
(intact, pruned, (pruned / (intact + pruned)) * 100))
out.append('')
fileio.write_all_lines(fileio.join(root, "train_max_frames_%d.txt" % hparams.max_mel_frames), out)
print("Kept: %d / Discarded: %d (%.1f%% discarded)" %
(intact, pruned, (pruned / (intact + pruned)) * 100))
\ No newline at end of file
logmmse
webrtcvad
falcon==1.2.0
inflect==0.2.5
audioread==2.1.5
librosa==0.5.1
matplotlib==2.0.2
numpy==1.14.0
scipy==1.0.0
tqdm==4.11.2
Unidecode==0.4.20
pyaudio==0.2.11
sounddevice==0.3.10
lws
keras
\ No newline at end of file
Scientists at the CERN laboratory say they have discovered a new particle.
There's a way to measure the acute emotional intelligence that has never gone out of style.
President Trump met with other leaders at the Group of 20 conference.
The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.
Generative adversarial network or variational auto-encoder.
Basilar membrane and otolaryngology are not auto-correlations.
He has read the whole thing.
He reads books.
He thought it was time to present the present.
Thisss isrealy awhsome.
Punctuation sensitivity, is working.
Punctuation sensitivity is working.
Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.
Tajima Airport serves Toyooka.
Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that the adopted architecture is able to perform this task with wild success.
Thank you so much for your support!
\ No newline at end of file