Commit 9a2f1ddd authored by: Corentin Jemine

Moved tacotron2 under SV2TTS

Parent 6190c6e9
......@@ -14,5 +14,5 @@
*.bcf
_old
SV2TTS/encoder/saved_models/*_backups
tacotron2/logs-*
SV2TTS/synthesizer/saved_models/*
wave-rnn/model_outputs
\ No newline at end of file
MIT License
Original work Copyright (c) 2018 Rayhane Mama
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
from time import perf_counter
from encoder import inference
from config import *
from encoder.params_data import sampling_rate
from pathlib import Path
import torch
if __name__ == '__main__':
fpath = r"E:\Datasets\LibriSpeech\train-other-500\149\125760\149-125760-0003.flac"
wave = inference.load_and_preprocess_wave(fpath)
wav = inference.load_preprocess_waveform(fpath)
models_dir = Path("encoder/saved_models")
model_fpath = models_dir.joinpath("all.pt")
torch.cuda.synchronize()
model_fpath = fileio.join(model_dir, "all.pt")
start = perf_counter()
inference.load_model(model_fpath, device)
inference.load_model(model_fpath)
print("Loaded model in %.2fs" % (perf_counter() - start))
torch.cuda.synchronize()
duration = len(wave) / sampling_rate
duration = len(wav) / sampling_rate
start = perf_counter()
embed = inference.embed_utterance(wave)
embed = inference.embed_utterance(wav)
torch.cuda.synchronize()
print("Processed %.2fs long utterance in %.2fs" % (duration, perf_counter() - start))
start = perf_counter()
embed = inference.embed_utterance(wave)
embed = inference.embed_utterance(wav)
torch.cuda.synchronize()
print("Processed %.2fs long utterance in %.2fs" % (duration, perf_counter() - start))
print(embed)
from datasets.audio import inv_mel_spectrogram
from tacotron import synthesizer
from hparams import hparams
from synthesizer.hparams import hparams
from vlibs import fileio
import sounddevice as sd
import tensorflow as tf
......
......@@ -211,7 +211,7 @@ class UMapDemoUI(QDialog):
self.dataset_box.currentText(),
self.speaker_box.currentText(),
self.utterance_box.currentText())
self.utterance = inference.load_and_preprocess_wave(fpath)
self.utterance = inference.load_preprocess_waveform(fpath)
self.is_record = False
def embed_utterance(self, demo, speaker_name=None, go_next=None):
......
......@@ -9,17 +9,20 @@ from matplotlib import cm
_model = None # type: SpeakerEncoder
_device = None # type: torch.device
def load_model(weights_fpath, device):
def load_model(weights_fpath, device=None):
"""
Loads the model in memory. If this function is not explicitly called, it will be run on the
first call to embed_frames() with the default weights file.
:param weights_fpath: the path to saved model weights.
:param device: either a torch device or the name of a torch device (e.g. 'cpu', 'cuda'). The
model will be loaded and will run on this device. Outputs will however always be on the cpu.
model will be loaded and will run on this device. Outputs will however always be on the cpu.
If None, will default to your GPU if it's available, otherwise your CPU.
"""
global _model, _device
if isinstance(device, str):
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
elif isinstance(device, str):
device = torch.device(device)
_device = device
_model = SpeakerEncoder(_device)
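# Usage sketch (paths are illustrative, not part of this commit): with the new default,
# the device argument can now be omitted entirely:
#   inference.load_model(Path("encoder/saved_models/all.pt"))          # picks CUDA if available, else CPU
#   inference.load_model(Path("encoder/saved_models/all.pt"), "cpu")   # or force a specific device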
......@@ -76,28 +79,28 @@ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_fram
frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
# Compute the slices
wave_slices, mel_slices = [], []
wav_slices, mel_slices = [], []
steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
for i in range(0, steps, frame_step):
mel_range = np.array([i, i + partial_utterance_n_frames])
wave_range = mel_range * samples_per_frame
wav_range = mel_range * samples_per_frame
mel_slices.append(slice(*mel_range))
wave_slices.append(slice(*wave_range))
wav_slices.append(slice(*wav_range))
# Evaluate whether extra padding is warranted or not
last_wave_range = wave_slices[-1]
coverage = (n_samples - last_wave_range.start) / (last_wave_range.stop - last_wave_range.start)
last_wav_range = wav_slices[-1]
coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
if coverage < min_pad_coverage and len(mel_slices) > 1:
mel_slices = mel_slices[:-1]
wave_slices = wave_slices[:-1]
wav_slices = wav_slices[:-1]
return wave_slices, mel_slices
return wav_slices, mel_slices
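# Worked example (parameter values assumed for illustration): with partial_utterance_n_frames=160
# and overlap=0.5, frame_step = max(int(round(160 * (1 - 0.5))), 1) = 80, so a new partial starts
# every 80 mel frames and each wav slice spans 160 * samples_per_frame audio samples. If the last
# slice covers less than min_pad_coverage of real audio, it is dropped by the coverage check above.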
def embed_utterance(wave, using_partials=True, return_partials=False, **kwargs):
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
"""
Computes an embedding for a single utterance.
:param wave: the utterance waveform as a numpy array of float32
:param wav: the utterance waveform as a numpy array of float32
:param using_partials: if True, then the utterance is split in partial utterances of
<partial_utterance_n_frames> frames and the utterance embedding is computed from their
normalized average. If False, the utterance is instead computed from feeding the entire
......@@ -113,20 +116,20 @@ def embed_utterance(wave, using_partials=True, return_partials=False, **kwargs):
"""
# Process the entire utterance if not using partials
if not using_partials:
frames = audio.wave_to_mel_filterbank(wave)
frames = audio.wave_to_mel_filterbank(wav)
embed = embed_frames_batch(frames[None, ...])[0]
if return_partials:
return embed, None, None
return embed
# Compute where to split the utterance into partials and pad if necessary
wave_slices, mel_slices = compute_partial_slices(len(wave), **kwargs)
wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
max_wave_length = wave_slices[-1].stop
if max_wave_length >= len(wave):
wave = np.pad(wave, (0, max_wave_length - len(wave)), 'constant')
if max_wave_length >= len(wav):
wav = np.pad(wav, (0, max_wave_length - len(wav)), 'constant')
# Split the utterance into partials
frames = audio.wave_to_mel_filterbank(wave)
frames = audio.wave_to_mel_filterbank(wav)
frames_batch = np.array([frames[s] for s in mel_slices])
partial_embeds = embed_frames_batch(frames_batch)
......@@ -137,14 +140,11 @@ def embed_utterance(wave, using_partials=True, return_partials=False, **kwargs):
if return_partials:
return embed, partial_embeds, wave_slices
return embed
def embed_stream(stream, partial_utterance_n_frames=partials_n_frames, overlap=0.5):
raise NotImplementedError()
def embed_speaker(waves, normalize=False, **kwargs):
def embed_speaker(wavs, normalize=False, **kwargs):
raise NotImplementedError()
def load_and_preprocess_wave(fpath):
def load_preprocess_waveform(fpath):
"""
Loads an audio file in memory and applies the same preprocessing operations used in training
the Speaker Encoder. Using this function is not mandatory but recommended.
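# Minimal usage sketch (file path is an assumption):
#   wav = load_preprocess_waveform("path/to/utterance.flac")
#   embed = embed_utterance(wav)   # normalized average of the partial embeddings (see docstring above)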
......@@ -178,16 +178,16 @@ if __name__ == '__main__':
fig, axes = plt.subplots(3, 3)
for i, ax in enumerate(axes.flatten(), 50):
fpath = r"E:\Datasets\LJSpeech-1.1\wavs\LJ001-%04d.wav" % (i + 1)
wave = load_and_preprocess_wave(fpath)
embed = embed_utterance(wave)
wav = load_preprocess_waveform(fpath)
embed = embed_utterance(wav)
plot_embedding_as_heatmap(embed, ax)
plt.show(block=False)
fig, axes = plt.subplots(3, 3)
for i, ax in enumerate(axes.flatten(), 20):
fpath = r"E:\Datasets\LibriSpeech\train-other-500\25\123319\25-123319-%04d.flac" % i
wave = load_and_preprocess_wave(fpath)
embed = embed_utterance(wave)
wav = load_preprocess_waveform(fpath)
embed = embed_utterance(wav)
plot_embedding_as_heatmap(embed, ax)
plt.show()
......
......@@ -40,6 +40,7 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, vis_every: int,
print('No model \"%s\" found, starting training from scratch.' % run_id)
else:
print("Starting the training from scratch.")
model.train()
# Initialize the visualization environment
device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU')
......
webrtcvad
falcon>=1.2.0
inflect>=0.2.5
audioread>=2.1.5
librosa>=0.5.1
matplotlib>=2.0.2
numpy>=1.14.0
scipy>=1.0.0
tqdm>=4.11.2
Unidecode>=0.4.20
pyaudio>=0.2.11
sounddevice>=0.3.10
\ No newline at end of file
import librosa
import librosa.filters
import numpy as np
import tensorflow as tf
from scipy import signal
from scipy.io import wavfile
def load_wav(path, sr):
return librosa.core.load(path, sr=sr)[0]
def save_wav(wav, path, sr):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
#proposed by @dsmiller
wavfile.write(path, sr, wav.astype(np.int16))
def save_wavenet_wav(wav, path, sr):
librosa.output.write_wav(path, wav, sr=sr)
def preemphasis(wav, k, preemphasize=True):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
def inv_preemphasis(wav, k, inv_preemphasize=True):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break
assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold
return start, end
def get_hop_size(hparams):
hop_size = hparams.hop_size
if hop_size is None:
assert hparams.frame_shift_ms is not None
hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
return hop_size
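# Worked example (values taken from the hparams comments): if hop_size is None and
# frame_shift_ms=12.5 with sample_rate=16000, then hop_size = int(12.5 / 1000 * 16000) = 200 samples.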
def linearspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def melspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def inv_linear_spectrogram(linear_spectrogram, hparams):
'''Converts linear spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(linear_spectrogram, hparams)
else:
D = linear_spectrogram
S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def inv_mel_spectrogram(mel_spectrogram, hparams):
'''Converts mel spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(mel_spectrogram, hparams)
else:
D = mel_spectrogram
S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def _lws_processor(hparams):
import lws
return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
def _griffin_lim(S, hparams):
'''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
'''
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))
y = _istft(S_complex * angles, hparams)
return y
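# Note on the loop above: starting from random phases, each of hparams.griffin_lim_iters iterations
# re-estimates the phase from the STFT of the current waveform while keeping the known magnitudes
# |S|, which is the standard Griffin-Lim fixed-point update.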
def _stft(y, hparams):
if hparams.use_lws:
return _lws_processor(hparams).stft(y).T
else:
return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
def _istft(y, hparams):
return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M
def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
'''compute right padding (final frame)
'''
return int(fsize // 2)
# Conversions
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, hparams):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis(hparams)
return np.dot(_mel_basis, spectogram)
def _mel_to_linear(mel_spectrogram, hparams):
global _inv_mel_basis
if _inv_mel_basis is None:
_inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def _build_mel_basis(hparams):
assert hparams.fmax <= hparams.sample_rate // 2
return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
fmin=hparams.fmin, fmax=hparams.fmax)
def _amp_to_db(x, hparams):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, (x) * 0.05)
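# Round-trip check (illustrative): _amp_to_db maps an amplitude a to 20 * log10(max(min_level, a))
# and _db_to_amp maps x back to 10 ** (x / 20), so _db_to_amp(_amp_to_db(a, hparams)) == max(min_level, a).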
def _normalize(S, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-hparams.max_abs_value, hparams.max_abs_value)
else:
return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
if hparams.symmetric_mels:
return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
else:
return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
def _denormalize(D, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return (((np.clip(D, -hparams.max_abs_value,
hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
+ hparams.min_level_db)
else:
return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
if hparams.symmetric_mels:
return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
else:
return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
import os
import logmmse
from vlibs import fileio
import numpy as np
from datasets import audio
......@@ -11,7 +10,7 @@ from encoder import inference
def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
"""
Preprocesses the speech dataset from a gven input path to given output directories
Preprocesses the speech dataset from a given input path to given output directories
Args:
- hparams: hyper parameters
......@@ -32,23 +31,17 @@ def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
for input_dir in input_dirs:
for speaker_dir in fileio.listdir(input_dir, full_path=True):
print(" " + speaker_dir)
for utterance_dir in fileio.listdir(speaker_dir, full_path=True):
alignment_file = fileio.get_files(utterance_dir, '.alignment.txt')[0]
for line in fileio.read_all_lines(alignment_file):
# Retrieve the audio filepath and its alignment data
basename, words, end_times = line.strip().split(' ')
words = words.replace('\"', '').split(',')
end_times = [float(e) for e in end_times.replace('\"', '').split(',')]
wav_path = fileio.join(utterance_dir, basename + '.flac')
# Split utterances on silences
wavs, texts = _clean_and_split_utterance(wav_path, words, end_times, hparams)
# Process all parts of the utterance
for i, (wav, text) in enumerate(zip(wavs, texts)):
sub_basename = "%s_%02d" % (basename, i)
data.append(_process_utterance(mel_dir, embed_dir, wav_dir, sub_basename,
wav, text, hparams))
for book_dir in fileio.listdir(speaker_dir, full_path=True):
text_fpaths = fileio.get_files(book_dir, '\.normalized\.txt')
wav_fpaths = fileio.get_files(book_dir, '\.wav')
assert len(text_fpaths) == len(wav_fpaths)
for text_fpath, wav_fpath in zip(text_fpaths, wav_fpaths):
basename = os.path.splitext(fileio.leaf(wav_fpath))[0]
text = fileio.read_all_lines(text_fpath)[0].rstrip()
text = text.lower()
data.append(_process_utterance(mel_dir, embed_dir, wav_dir, basename,
wav_fpath, text, hparams))
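# Expected on-disk layout for the loop above (filenames illustrative, LibriTTS-style):
#   <input_dir>/<speaker>/<book>/84_121123_000007_000001.wav
#   <input_dir>/<speaker>/<book>/84_121123_000007_000001.normalized.txt
# Each .normalized.txt carries the transcript on its first line, which is lowercased before use.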
n_all_samples = len(data)
data = [d for d in data if d is not None]
......@@ -57,34 +50,7 @@ def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
(n_all_samples, n_all_samples - n_remaining_samples, n_remaining_samples))
return data
def _clean_and_split_utterance(wav_path, words, end_times, hparams):
# Load and rescale the audio
wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# Suppress the noise
wav = logmmse.logmmse(wav, hparams.sample_rate)
# Find pauses in the sentence
words = np.array(words)
start_times = np.array([0.0] + end_times[:-1])
end_times = np.array(end_times)
assert len(words) == len(end_times) == len(start_times)
assert words[0] == '' and words[-1] == ''
# Break the sentence on pauses that are too long
mask = (words == '') & (end_times - start_times >= hparams.silence_min_duration_split)
mask[0] = mask[-1] = True
breaks = np.where(mask)[0]
segment_times = [[end_times[s], start_times[e]] for s, e in zip(breaks[:-1], breaks[1:])]
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
texts = [' '.join(words[s + 1:e]).replace(' ', ' ') for s, e in zip(breaks[:-1], breaks[1:])]
return wavs, texts
def _process_utterance(mel_dir, embed_dir, wav_dir, basename, wav, text, hparams):
def _process_utterance(mel_dir, embed_dir, wav_dir, basename, wav_path, text, hparams):
"""
Preprocesses a single utterance wav/text pair.
......@@ -95,13 +61,17 @@ def _process_utterance(mel_dir, embed_dir, wav_dir, basename, wav, text, hparams
- embed_dir: the directory to write the embedding into
- wav_dir: the directory to write the preprocessed wav into
- basename: the source base filename to use in the spectrogram filename
- wav: the audio waveform unprocessed
- wav_path: the path to the audio waveform
- text: text spoken in the audio
- hparams: hyper parameters
Returns:
- A tuple: (audio_filename, mel_filename, embed_filename, time_steps, mel_frames, text)
"""
"""
wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
if hparams.rescale:
wav = (wav / np.abs(wav).max()) * hparams.rescaling_max
# Compute the mel scale spectrogram from the wav
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_frames = mel_spectrogram.shape[1]
......
import os
import threading
import time
import traceback
import numpy as np
import tensorflow as tf
from infolog import log
from synthesizer.infolog import log
from sklearn.model_selection import train_test_split
from tacotron.utils.text import text_to_sequence
......
......@@ -93,10 +93,10 @@ hparams = tf.contrib.training.HParams(
# showing black silent regions on top), then restart from step 2.
num_mels=80, # Number of mel-spectrogram channels and local conditioning dimensionality
# num_freq=1025, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing
num_freq=513, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing
num_freq=1025, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing
# network
rescale=False, # Whether to rescale audio prior to preprocessing
rescaling_max=0.999, # Rescaling value
rescale=True, # Whether to rescale audio prior to preprocessing
rescaling_max=0.9, # Rescaling value
# Whether to clip silence in Audio (at beginning and end of audio only, not the middle)
# train samples of lengths between 3sec and 14sec are more than enough to make a model capable
# of good parallelization.
......@@ -123,10 +123,10 @@ hparams = tf.contrib.training.HParams(
# sample_rate=22050, # 22050 Hz (corresponding to ljspeech dataset) (sox --i <filename>)
# FOR DATASETS IN 16000Hz:
n_fft=800, # Extra window size is filled with 0 paddings to match this parameter
hop_size=200, # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
win_size=800, # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
sample_rate=16000, # 16000Hz (corresponding to librispeech) (sox --i <filename>)
n_fft=2048, # Extra window size is filled with 0 paddings to match this parameter
hop_size=300, # For 24000Hz, 300 = 12.5 ms (0.0125 * sample_rate)
win_size=1200, # For 24000Hz, 1200 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
sample_rate=24000, # 24000Hz (corresponding to libritts) (sox --i <filename>)
frame_shift_ms=None, # Can replace hop_size parameter. (Recommended: 12.5)
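# Sanity check on the values above (arithmetic only): at 24000 Hz, hop_size = 0.0125 * 24000 = 300
# (12.5 ms) and win_size = 0.05 * 24000 = 1200 (50 ms), replacing the 16000 Hz values 200 and 800.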
......@@ -181,7 +181,7 @@ hparams = tf.contrib.training.HParams(
# Determines whether the decoder should stop when predicting <stop> to any frame or to all of
# them (True works pretty well)
embedding_dim=512, # dimension of embedding space
embedding_dim=512, # dimension of embedding space (TODO: clarify/delete this)
# Encoder parameters
enc_conv_num_layers=3, # number of encoder convolutional layers
......@@ -252,7 +252,7 @@ hparams = tf.contrib.training.HParams(
# major slowdowns! Only use when critical!)
# train/test split ratios, mini-batches sizes
tacotron_batch_size=40, # number of training samples on each training steps (was 32)
tacotron_batch_size=36, # number of training samples on each training steps (was 32)
# Tacotron Batch synthesis supports ~16x the training batch size (no gradients during
# testing).
# Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times
......
import tensorflow as tf
from tacotron.utils.symbols import symbols
from infolog import log
from synthesizer.infolog import log
from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper
from tacotron.models.modules import *
from tensorflow.contrib.seq2seq import dynamic_decode
from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from tacotron.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from tacotron.models.custom_decoder import CustomDecoder
from tacotron.models.attention import LocationSensitiveAttention
......
import argparse
import os
import re
import time
from time import sleep
import tensorflow as tf
from hparams import hparams, hparams_debug_string
from infolog import log
from synthesizer.hparams import hparams_debug_string
from synthesizer.infolog import log
from tacotron.synthesizer import Synthesizer
from tqdm import tqdm
......
......@@ -5,7 +5,7 @@ import numpy as np
import pyaudio
import tensorflow as tf
from datasets import audio
from infolog import log
from synthesizer.infolog import log
from tacotron.models import create_model
from tacotron.utils import plot
from tacotron.utils.text import text_to_sequence
......
import argparse
import os
import subprocess
import time
import traceback
from datetime import datetime
import infolog
from synthesizer import infolog
import numpy as np
import tensorflow as tf
from datasets import audio
from hparams import hparams_debug_string
from synthesizer.hparams import hparams_debug_string
from tacotron.feeder import Feeder
from tacotron.models import create_model
from tacotron.utils import ValueWindow, plot
......
import argparse
import os
from multiprocessing import cpu_count
from datasets import preprocessor
from synthesizer.hparams import hparams
from vlibs import fileio
def preprocess(args, input_folders, out_dir, hparams):
mel_dir = os.path.join(out_dir, 'mels')
wav_dir = os.path.join(out_dir, 'audio')
embed_dir = os.path.join(out_dir, 'embed')
os.makedirs(mel_dir, exist_ok=True)
os.makedirs(wav_dir, exist_ok=True)
os.makedirs(embed_dir, exist_ok=True)
metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, embed_dir, wav_dir)
write_metadata(metadata, out_dir)
def write_metadata(metadata, out_dir):
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
for m in metadata:
f.write('|'.join([str(x) for x in m]) + '\n')
mel_frames = sum([int(m[4]) for m in metadata])
timesteps = sum([int(m[3]) for m in metadata])
sr = hparams.sample_rate
hours = timesteps / sr / 3600
print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
len(metadata), mel_frames, timesteps, hours))
print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata)))
def norm_data(args):
print('Selecting data folders..')
dataset_dir = fileio.join(args.base_dir, 'LibriTTS')
if args.sets is not None:
sets = args.sets
else:
sets = [set for set in fileio.listdir(dataset_dir) if set.startswith('train-clean')]
return fileio.join(dataset_dir, sets)
def run_preprocess(args, hparams):
input_folders = norm_data(args)
output_folder = os.path.join(args.base_dir, args.output)
preprocess(args, input_folders, output_folder, hparams)
def main():
print('Initializing preprocessing..')
parser = argparse.ArgumentParser()
# Root data directory that contains the LibriTTS directory
parser.add_argument('--base_dir', default='')
parser.add_argument('--hparams', default='',
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
parser.add_argument('--output', default='Synthesizer')
parser.add_argument('--n_jobs', type=int, default=cpu_count())
# Name of the LibriTTS sets to use, separated by spaces
# (e.g. "--sets train-other-500 train-clean-360). Defaults to using all the clean training sets
# present in the LibriSpeech directory.
parser.add_argument('--sets', type=str, nargs='+', default=None)
args = parser.parse_args()
modified_hp = hparams.parse(args.hparams)
run_preprocess(args, modified_hp)
if __name__ == '__main__':
main()
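# Example invocation (base_dir and set names are assumptions, not part of this commit):
#   python preprocess.py --base_dir /data --sets train-clean-100 train-clean-360
# This writes the mels/, audio/ and embed/ folders plus train.txt under /data/Synthesizer.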
import argparse
import os
from time import sleep
import infolog
from synthesizer import infolog
import tensorflow as tf
from hparams import hparams
from infolog import log
from synthesizer.hparams import hparams
from synthesizer.infolog import log
from tacotron.synthesize import tacotron_synthesize
from tacotron.train import tacotron_train
......
import argparse
import os
from warnings import warn
from time import sleep
import tensorflow as tf
from hparams import hparams
from infolog import log
from synthesizer.hparams import hparams
from synthesizer.infolog import log
from tacotron.synthesize import tacotron_synthesize
......
MIT License
Original work Copyright (c) 2018 Rayhane Mama
Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Tacotron-2:
Tensorflow implementation of DeepMind's Tacotron-2. A deep neural network architecture described in this paper: [Natural TTS synthesis by conditioning Wavenet on MEL spectrogram predictions](https://arxiv.org/pdf/1712.05884.pdf)
# Repository Structure:
Tacotron-2
├── datasets
├── en_UK (0)
│   └── by_book
│       └── female
├── en_US (0)
│   └── by_book
│       ├── female
│       └── male
├── LJSpeech-1.1 (0)
│   └── wavs
├── logs-Tacotron (2)
│   ├── eval_-dir
│   │   ├── plots
│   │   └── wavs
│   ├── mel-spectrograms
│   ├── plots
│   ├── pretrained
│   └── wavs
├── logs-Wavenet (4)
│   ├── eval-dir
│   │   ├── plots
│   │   └── wavs
│   ├── plots
│   ├── pretrained
│   └── wavs
├── papers
├── tacotron
│   ├── models
│   └── utils
├── tacotron_output (3)
│   ├── eval
│   ├── gta
│   ├── logs-eval
│   │   ├── plots
│   │   └── wavs
│   └── natural
├── wavenet_output (5)
│   ├── plots
│   └── wavs
├── training_data (1)
│   ├── audio
│   ├── linear
│   └── mels
└── wavenet_vocoder
    └── models
The previous tree shows the current state of the repository (separate training, one step at a time).
- Step **(0)**: Get your dataset; here I have set up examples for **Ljspeech**, **en_US** and **en_UK** (from **M-AILABS**).
- Step **(1)**: Preprocess your data. This will give you the **training_data** folder.
- Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder.
- Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder.
- Step **(4)**: Train your Wavenet model. Yields the **logs-Wavenet** folder.
- Step **(5)**: Synthesize audio using the Wavenet model. Gives the **wavenet_output** folder.
Note:
- **Our preprocessing only supports Ljspeech and Ljspeech-like datasets (M-AILABS speech data)!** If running on datasets stored differently, you will probably need to make your own preprocessing script.
- In the previous tree, files **were not represented** and **max depth was set to 3** for simplicity.
- If you run training of both **models at the same time**, repository structure will be different.
# Pretrained model and Samples:
Pre-trained models and audio samples will be added at a later date. You can, however, check some preliminary insights into the model's performance (at early stages of training) [here](https://github.com/Rayhane-mamah/Tacotron-2/issues/4#issuecomment-378741465). THIS IS VERY OUTDATED, I WILL UPDATE THIS SOON
# Model Architecture:
<p align="center">
<img src="https://preview.ibb.co/bU8sLS/Tacotron_2_Architecture.png"/>
</p>
The model described by the authors can be divided into two parts:
- Spectrogram prediction network
- Wavenet vocoder
To have an in-depth exploration of the model architecture, training procedure and preprocessing logic, refer to [our wiki](https://github.com/Rayhane-mamah/Tacotron-2/wiki)
# Current state:
For an overview of our progress on this project, please refer to [this discussion](https://github.com/Rayhane-mamah/Tacotron-2/issues/4)
Since the two parts of the global model are trained separately, we can start by training the feature prediction model and use its predictions later during the wavenet training.
# How to start
First, you need to have Python 3 installed along with [Tensorflow](https://www.tensorflow.org/install/).
Next, you can install the requirements. If you are an Anaconda user: (otherwise replace **pip** with **pip3** and **python** with **python3**)
> pip install -r requirements.txt
# Dataset:
We tested the code above on the [ljspeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled single-actress voice recordings. (Further info on the dataset is available in the README file that comes with the download.)
We are also running current tests on the [new M-AILABS speech dataset](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) which contains more than 700h of speech (more than 80 Gb of data) for more than 10 languages.
After **downloading** the dataset, **extract** the compressed file, and **place the folder inside the cloned repository.**
# Hparams setting:
Before proceeding, you must pick the hyperparameters that best suit your needs. While it is possible to change the hyperparameters from the command line during preprocessing/training, I still recommend making the changes once and for all on the **hparams.py** file directly.
To pick optimal fft parameters, I have made a **griffin_lim_synthesis_tool** notebook that you can use to invert real extracted mel/linear spectrograms and judge how good your preprocessing is. All other options are well explained in **hparams.py** and have meaningful names so that you can try multiple things with them.
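Hyperparameters can also be overridden on the command line through the **--hparams** argument; the values below are only an illustration:
> python preprocess.py --hparams='rescale=True,rescaling_max=0.9'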
# Preprocessing
Before running the following steps, please make sure you are inside **Tacotron-2 folder**
> cd Tacotron-2
Preprocessing can then be started using:
> python preprocess.py
The dataset can be chosen using the **--dataset** argument. If using the M-AILABS dataset, you need to provide the **language, voice, reader, merge_books and book arguments** for your custom needs. Default is **Ljspeech**.
Example M-AILABS:
> python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=False --book='northandsouth'
or if you want to use all books for a single speaker:
> python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=True
This should take no longer than a **few minutes.**
# Training:
To **train both models** sequentially (one after the other):
> python train.py --model='Tacotron-2'
Feature prediction model can **separately** be **trained** using:
> python train.py --model='Tacotron'
Checkpoints will be made every **5000 steps** and stored under the **logs-Tacotron folder.**
Naturally, **training the wavenet separately** is done by:
> python train.py --model='WaveNet'
logs will be stored inside **logs-Wavenet**.
**Note:**
- If the model argument is not provided, training will default to Tacotron-2 model training (both models).
- Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use.
- It is now possible to do wavenet preprocessing alone using **wavenet_preprocess.py**.
# Synthesis
To **synthesize audio** in an **End-to-End** (text to audio) manner (both models at work):
> python synthesize.py --model='Tacotron-2'
For the spectrogram prediction network (separately), there are **three types** of mel spectrogram synthesis:
- **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end-to-end model.
> python synthesize.py --model='Tacotron' --mode='eval'
- **Natural synthesis** (let the model make predictions alone by feeding last decoder output to the next time step).
> python synthesize.py --model='Tacotron' --GTA=False
- **Ground Truth Aligned synthesis** (DEFAULT: the model is assisted by true labels in a teacher forcing manner). This synthesis method is used when predicting mel spectrograms used to train the wavenet vocoder. (yields better results as stated in the paper)
> python synthesize.py --model='Tacotron' --GTA=True
Synthesizing the **waveforms** conditioned on previously synthesized Mel-spectrograms (separately) can be done with:
> python synthesize.py --model='WaveNet'
**Note:**
- If the model argument is not provided, synthesis will default to Tacotron-2 model synthesis (End-to-End TTS).
- Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use.
# References and Resources:
- [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf)
- [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf)
- [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf)
- [Wavenet: A generative model for raw audio](https://arxiv.org/pdf/1609.03499.pdf)
- [Fast Wavenet](https://arxiv.org/pdf/1611.09482.pdf)
- [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder)
- [keithito/tacotron](https://github.com/keithito/tacotron)
import librosa
import librosa.filters
import numpy as np
import tensorflow as tf
from scipy import signal
from scipy.io import wavfile
def load_wav(path, sr):
return librosa.core.load(path, sr=sr)[0]
def save_wav(wav, path, sr):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
#proposed by @dsmiller
wavfile.write(path, sr, wav.astype(np.int16))
def save_wavenet_wav(wav, path, sr):
librosa.output.write_wav(path, wav, sr=sr)
def preemphasis(wav, k, preemphasize=True):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
def inv_preemphasis(wav, k, inv_preemphasize=True):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break
assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold
return start, end
def get_hop_size(hparams):
hop_size = hparams.hop_size
if hop_size is None:
assert hparams.frame_shift_ms is not None
hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
return hop_size
def linearspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def melspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
if hparams.signal_normalization:
return _normalize(S, hparams)
return S
def inv_linear_spectrogram(linear_spectrogram, hparams):
'''Converts linear spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(linear_spectrogram, hparams)
else:
D = linear_spectrogram
S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def inv_mel_spectrogram(mel_spectrogram, hparams):
'''Converts mel spectrogram to waveform using librosa'''
if hparams.signal_normalization:
D = _denormalize(mel_spectrogram, hparams)
else:
D = mel_spectrogram
S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear
if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
def _lws_processor(hparams):
import lws
return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
def _griffin_lim(S, hparams):
'''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
'''
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))
y = _istft(S_complex * angles, hparams)
return y
def _stft(y, hparams):
if hparams.use_lws:
return _lws_processor(hparams).stft(y).T
else:
return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
def _istft(y, hparams):
return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M
def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
'''compute right padding (final frame)
'''
return int(fsize // 2)
# Conversions
_mel_basis = None
_inv_mel_basis = None
def _linear_to_mel(spectogram, hparams):
global _mel_basis
if _mel_basis is None:
_mel_basis = _build_mel_basis(hparams)
return np.dot(_mel_basis, spectogram)
def _mel_to_linear(mel_spectrogram, hparams):
global _inv_mel_basis
if _inv_mel_basis is None:
_inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
def _build_mel_basis(hparams):
assert hparams.fmax <= hparams.sample_rate // 2
return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
fmin=hparams.fmin, fmax=hparams.fmax)
def _amp_to_db(x, hparams):
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
return 20 * np.log10(np.maximum(min_level, x))
def _db_to_amp(x):
return np.power(10.0, (x) * 0.05)
def _normalize(S, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
-hparams.max_abs_value, hparams.max_abs_value)
else:
return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
if hparams.symmetric_mels:
return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
else:
return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
def _denormalize(D, hparams):
if hparams.allow_clipping_in_normalization:
if hparams.symmetric_mels:
return (((np.clip(D, -hparams.max_abs_value,
hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
+ hparams.min_level_db)
else:
return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
if hparams.symmetric_mels:
return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
else:
return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
import numpy as np
from datasets.audio import *
import os
from hparams import hparams
import sounddevice
n_sample = 0
mel_folder = 'logs-Tacotron/mel-spectrograms'
mel_file = 'mel-prediction-step-{}.npy'.format(n_sample)
out_dir = 'wav_out'
os.makedirs(out_dir, exist_ok=True)
#mel_file = os.path.join(mel_folder, mel_file)
from vlibs import fileio
# fnames = fileio.listdir('logs-two_outputs/mel-spectrograms/')
fnames = fileio.listdir('tacotron_output/eval/')
for i in range(1, len(fnames)):
# mel_file = 'logs-two_outputs/mel-spectrograms/mel-prediction-step-110000.npy'
mel_file = fileio.join('tacotron_output/eval/', fnames[i])
mel_spectro = np.load(mel_file) #.transpose()
wav = inv_mel_spectrogram(mel_spectro.T, hparams)
sounddevice.wait()
print(fnames[i])
sounddevice.play(wav, 16000)
sounddevice.wait()
quit()
save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
sr=hparams.sample_rate)
# In[3]:
from tacotron.utils.plot import *
plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))))
# In[4]:
lin_file = 'training_data/linear/linear-LJ001-0005.npy'
lin_spectro = np.load(lin_file)
lin_spectro.shape
# In[5]:
wav = inv_linear_spectrogram(lin_spectro.T, hparams)
save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
sr=hparams.sample_rate)
# In[6]:
plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
auto_aspect=True)
import argparse
import os
from multiprocessing import cpu_count
from datasets import preprocessor
from hparams import hparams
from vlibs import fileio
def preprocess(args, input_folders, out_dir, hparams):
mel_dir = os.path.join(out_dir, 'mels')
wav_dir = os.path.join(out_dir, 'audio')
embed_dir = os.path.join(out_dir, 'embed')
os.makedirs(mel_dir, exist_ok=True)
os.makedirs(wav_dir, exist_ok=True)
os.makedirs(embed_dir, exist_ok=True)
metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, embed_dir, wav_dir)
write_metadata(metadata, out_dir)
def write_metadata(metadata, out_dir):
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
for m in metadata:
f.write('|'.join([str(x) for x in m]) + '\n')
mel_frames = sum([int(m[4]) for m in metadata])
timesteps = sum([int(m[3]) for m in metadata])
sr = hparams.sample_rate
hours = timesteps / sr / 3600
print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
len(metadata), mel_frames, timesteps, hours))
print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata)))
def norm_data(args):
print('Selecting data folders..')
dataset_dir = fileio.join(args.base_dir, 'LibriSpeech')
if args.sets is not None:
sets = args.sets
else:
sets = [set for set in fileio.listdir(dataset_dir) if set.startswith('train-clean')]
return fileio.join(dataset_dir, sets)
def run_preprocess(args, hparams):
input_folders = norm_data(args)
output_folder = os.path.join(args.base_dir, args.output)
preprocess(args, input_folders, output_folder, hparams)
def main():
print('Initializing preprocessing..')
parser = argparse.ArgumentParser()
# Root data directory that contains the LibriSpeech directory
parser.add_argument('--base_dir', default='')
parser.add_argument('--hparams', default='',
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
parser.add_argument('--output', default='Synthesizer')
parser.add_argument('--n_jobs', type=int, default=cpu_count())
# Name of the LibriSpeech sets to use, separated by spaces
# (e.g. "--sets train-other-500 train-clean-360). Defaults to using all the clean training sets
# present in the LibriSpeech directory.
parser.add_argument('--sets', type=str, nargs='+', default=None)
args = parser.parse_args()
modified_hp = hparams.parse(args.hparams)
run_preprocess(args, modified_hp)
if __name__ == '__main__':
main()
from vlibs import fileio
import numpy as np
from hparams import hparams
root = '/home/cjemine/data/Synthesizer2/'
# On the remote, Synthesizer2 has max 900 frames.
# Try with 600, if it works, increase to 700.
lines = fileio.read_all_lines(fileio.join(root, "train.txt"))
out = []
pruned = 0
intact = 0
for line in lines:
line = line.rstrip()
audio_fname, mel_fname, embed_fname, *_ = line.split('|')
mel = np.load(fileio.join(root, "mels", mel_fname))
if len(mel) > hparams.max_mel_frames:
pruned += 1
else:
intact += 1
out.append(line)
if intact % 100 == 0:
print("Kept: %d / Discarded: %d (%.1f%% discarded)" %
(intact, pruned, (pruned / (intact + pruned)) * 100))
out.append('')
fileio.write_all_lines(fileio.join(root, "train_max_frames_%d.txt" % hparams.max_mel_frames), out)
print("Kept: %d / Discarded: %d (%.1f%% discarded)" %
(intact, pruned, (pruned / (intact + pruned)) * 100))
\ No newline at end of file
logmmse
webrtcvad
falcon==1.2.0
inflect==0.2.5
audioread==2.1.5
librosa==0.5.1
matplotlib==2.0.2
numpy==1.14.0
scipy==1.0.0
tqdm==4.11.2
Unidecode==0.4.20
pyaudio==0.2.11
sounddevice==0.3.10
lws
keras
\ No newline at end of file
Scientists at the CERN laboratory say they have discovered a new particle.
There's a way to measure the acute emotional intelligence that has never gone out of style.
President Trump met with other leaders at the Group of 20 conference.
The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.
Generative adversarial network or variational auto-encoder.
Basilar membrane and otolaryngology are not auto-correlations.
He has read the whole thing.
He reads books.
He thought it was time to present the present.
Thisss isrealy awhsome.
Punctuation sensitivity, is working.
Punctuation sensitivity is working.
Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.
Tajima Airport serves Toyooka.
Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that the adopted architecture is able to perform this task with wild success.
Thank you so much for your support!
\ No newline at end of file