Add files via upload

This commit is contained in:
Sam Khoze
2024-06-18 19:41:56 -07:00
committed by GitHub
parent 8580e27fbd
commit 7d608044ef
36 changed files with 14511 additions and 0 deletions
View File
+17
View File
@@ -0,0 +1,17 @@
import importlib
import os
from inspect import isclass
# import all files under configs/
# configs_dir = os.path.dirname(__file__)
# for file in os.listdir(configs_dir):
# path = os.path.join(configs_dir, file)
# if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
# config_name = file[: file.find(".py")] if file.endswith(".py") else file
# module = importlib.import_module("TTS.tts.configs." + config_name)
# for attribute_name in dir(module):
# attribute = getattr(module, attribute_name)
# if isclass(attribute):
# # Add the class to this package's variables
# globals()[attribute_name] = attribute
+107
View File
@@ -0,0 +1,107 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.align_tts import AlignTTSArgs
@dataclass
class AlignTTSConfig(BaseTTSConfig):
"""Defines parameters for AlignTTS model.
Example:
>>> from TTS.tts.configs.align_tts_config import AlignTTSConfig
>>> config = AlignTTSConfig()
Args:
model(str):
Model name used for selecting the right model at initialization. Defaults to `align_tts`.
positional_encoding (bool):
enable / disable positional encoding applied to the encoder output. Defaults to True.
hidden_channels (int):
Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder
parameters. Defaults to 256.
hidden_channels_dp (int):
Number of hidden channels of the duration predictor's layers. Defaults to 256.
encoder_type (str):
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `fftransformer`.
encoder_params (dict):
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
decoder_type (str):
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `fftransformer`.
decoder_params (dict):
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
phase_start_steps (List[int]):
A list of number of steps required to start the next training phase. AlignTTS has 4 different training
phases. Thus you need to define 4 different values to enable phase based training. If None, it
trains the whole model together. Defaults to None.
ssim_alpha (float):
Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
duration_loss_alpha (float):
Weight for the duration predictor's loss. Defaults to 1.0.
mdn_alpha (float):
Weight for the MDN loss. Defaults to 1.0.
spec_loss_alpha (float):
Weight for the MSE spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults 4000.
lr (float):
Initial learning rate. Defaults to `1e-3`.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage."""
model: str = "align_tts"
# model specific params
model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
phase_start_steps: List[int] = None
ssim_alpha: float = 1.0
spec_loss_alpha: float = 1.0
dur_loss_alpha: float = 1.0
mdn_alpha: float = 1.0
# multi-speaker settings
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = False
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = None
lr_scheduler_params: dict = None
lr: float = 1e-4
grad_clip: float = 5.0
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
+105
View File
@@ -0,0 +1,105 @@
import os
from dataclasses import dataclass, field
from typing import Dict
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
from TTS.tts.models.bark import BarkAudioConfig
from TTS.utils.generic_utils import get_user_data_dir
@dataclass
class BarkConfig(BaseTTSConfig):
"""Bark TTS configuration
Args:
model (str): model name that registers the model.
audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig().
num_chars (int): number of characters in the alphabet. Defaults to 0.
semantic_config (GPTConfig): semantic configuration. Defaults to GPTConfig().
fine_config (FineGPTConfig): fine configuration. Defaults to FineGPTConfig().
coarse_config (GPTConfig): coarse configuration. Defaults to GPTConfig().
CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024.
SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9.
SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size. Defaults to 10_000.
CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024.
N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2.
N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8.
COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75.
SAMPLE_RATE (int): sample rate. Defaults to 24_000.
USE_SMALLER_MODELS (bool): use smaller models. Defaults to False.
TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048.
SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000.
TEXT_PAD_TOKEN ([type]): text pad token. Defaults to 10_048.
TEXT_EOS_TOKEN ([type]): text end of sentence token. Defaults to 10_049.
TEXT_SOS_TOKEN ([type]): text start of sentence token. Defaults to 10_050.
SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 10_051.
COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048.
COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050.
REMOTE_BASE_URL ([type]): remote base url. Defaults to "https://huggingface.co/erogol/bark/tree".
REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None.
LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None.
SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None.
CACHE_DIR (str): local cache directory. Defaults to get_user_data_dir().
DEF_SPEAKER_DIR (str): default speaker directory to stoke speaker values for voice cloning. Defaults to get_user_data_dir().
"""
model: str = "bark"
audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
num_chars: int = 0
semantic_config: GPTConfig = field(default_factory=GPTConfig)
fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
coarse_config: GPTConfig = field(default_factory=GPTConfig)
CONTEXT_WINDOW_SIZE: int = 1024
SEMANTIC_RATE_HZ: float = 49.9
SEMANTIC_VOCAB_SIZE: int = 10_000
CODEBOOK_SIZE: int = 1024
N_COARSE_CODEBOOKS: int = 2
N_FINE_CODEBOOKS: int = 8
COARSE_RATE_HZ: int = 75
SAMPLE_RATE: int = 24_000
USE_SMALLER_MODELS: bool = False
TEXT_ENCODING_OFFSET: int = 10_048
SEMANTIC_PAD_TOKEN: int = 10_000
TEXT_PAD_TOKEN: int = 129_595
SEMANTIC_INFER_TOKEN: int = 129_599
COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
COARSE_INFER_TOKEN: int = 12_050
REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
REMOTE_MODEL_PATHS: Dict = None
LOCAL_MODEL_PATHS: Dict = None
SMALL_REMOTE_MODEL_PATHS: Dict = None
CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0"))
DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers"))
def __post_init__(self):
self.REMOTE_MODEL_PATHS = {
"text": {
"path": os.path.join(self.REMOTE_BASE_URL, "text_2.pt"),
"checksum": "54afa89d65e318d4f5f80e8e8799026a",
},
"coarse": {
"path": os.path.join(self.REMOTE_BASE_URL, "coarse_2.pt"),
"checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
},
"fine": {
"path": os.path.join(self.REMOTE_BASE_URL, "fine_2.pt"),
"checksum": "59d184ed44e3650774a2f0503a48a97b",
},
}
self.LOCAL_MODEL_PATHS = {
"text": os.path.join(self.CACHE_DIR, "text_2.pt"),
"coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
"fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
"hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
"hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
}
self.SMALL_REMOTE_MODEL_PATHS = {
"text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
"coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")},
"fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")},
}
self.sample_rate = self.SAMPLE_RATE # pylint: disable=attribute-defined-outside-init
+170
View File
@@ -0,0 +1,170 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig
@dataclass
class DelightfulTTSConfig(BaseTTSConfig):
"""
Configuration class for the DelightfulTTS model.
Attributes:
model (str): Name of the model ("delightful_tts").
audio (DelightfulTtsAudioConfig): Configuration for audio settings.
model_args (DelightfulTtsArgs): Configuration for model arguments.
use_attn_priors (bool): Whether to use attention priors.
vocoder (VocoderConfig): Configuration for the vocoder.
init_discriminator (bool): Whether to initialize the discriminator.
steps_to_start_discriminator (int): Number of steps to start the discriminator.
grad_clip (List[float]): Gradient clipping values.
lr_gen (float): Learning rate for the gan generator.
lr_disc (float): Learning rate for the gan discriminator.
lr_scheduler_gen (str): Name of the learning rate scheduler for the generator.
lr_scheduler_gen_params (dict): Parameters for the learning rate scheduler for the generator.
lr_scheduler_disc (str): Name of the learning rate scheduler for the discriminator.
lr_scheduler_disc_params (dict): Parameters for the learning rate scheduler for the discriminator.
scheduler_after_epoch (bool): Whether to schedule after each epoch.
optimizer (str): Name of the optimizer.
optimizer_params (dict): Parameters for the optimizer.
ssim_loss_alpha (float): Alpha value for the SSIM loss.
mel_loss_alpha (float): Alpha value for the mel loss.
aligner_loss_alpha (float): Alpha value for the aligner loss.
pitch_loss_alpha (float): Alpha value for the pitch loss.
energy_loss_alpha (float): Alpha value for the energy loss.
u_prosody_loss_alpha (float): Alpha value for the utterance prosody loss.
p_prosody_loss_alpha (float): Alpha value for the phoneme prosody loss.
dur_loss_alpha (float): Alpha value for the duration loss.
char_dur_loss_alpha (float): Alpha value for the character duration loss.
binary_align_loss_alpha (float): Alpha value for the binary alignment loss.
binary_loss_warmup_epochs (int): Number of warm-up epochs for the binary loss.
disc_loss_alpha (float): Alpha value for the discriminator loss.
gen_loss_alpha (float): Alpha value for the generator loss.
feat_loss_alpha (float): Alpha value for the feature loss.
vocoder_mel_loss_alpha (float): Alpha value for the vocoder mel loss.
multi_scale_stft_loss_alpha (float): Alpha value for the multi-scale STFT loss.
multi_scale_stft_loss_params (dict): Parameters for the multi-scale STFT loss.
return_wav (bool): Whether to return audio waveforms.
use_weighted_sampler (bool): Whether to use a weighted sampler.
weighted_sampler_attrs (dict): Attributes for the weighted sampler.
weighted_sampler_multipliers (dict): Multipliers for the weighted sampler.
r (int): Value for the `r` override.
compute_f0 (bool): Whether to compute F0 values.
f0_cache_path (str): Path to the F0 cache.
attn_prior_cache_path (str): Path to the attention prior cache.
num_speakers (int): Number of speakers.
use_speaker_embedding (bool): Whether to use speaker embedding.
speakers_file (str): Path to the speaker file.
speaker_embedding_channels (int): Number of channels for the speaker embedding.
language_ids_file (str): Path to the language IDs file.
"""
model: str = "delightful_tts"
# model specific params
audio: DelightfulTtsAudioConfig = field(default_factory=DelightfulTtsAudioConfig)
model_args: DelightfulTtsArgs = field(default_factory=DelightfulTtsArgs)
use_attn_priors: bool = True
# vocoder
vocoder: VocoderConfig = field(default_factory=VocoderConfig)
init_discriminator: bool = True
# optimizer
steps_to_start_discriminator: int = 200000
grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
lr_gen: float = 0.0002
lr_disc: float = 0.0002
lr_scheduler_gen: str = "ExponentialLR"
lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
lr_scheduler_disc: str = "ExponentialLR"
lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
scheduler_after_epoch: bool = True
optimizer: str = "AdamW"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
# acoustic model loss params
ssim_loss_alpha: float = 1.0
mel_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 1.0
energy_loss_alpha: float = 1.0
u_prosody_loss_alpha: float = 0.5
p_prosody_loss_alpha: float = 0.5
dur_loss_alpha: float = 1.0
char_dur_loss_alpha: float = 0.01
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 10
# vocoder loss params
disc_loss_alpha: float = 1.0
gen_loss_alpha: float = 1.0
feat_loss_alpha: float = 1.0
vocoder_mel_loss_alpha: float = 10.0
multi_scale_stft_loss_alpha: float = 2.5
multi_scale_stft_loss_params: dict = field(
default_factory=lambda: {
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240],
}
)
# data loader params
return_wav: bool = True
use_weighted_sampler: bool = False
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
attn_prior_cache_path: str = None
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
use_speaker_embedding: bool = False
speakers_file: str = None
speaker_embedding_channels: int = 256
language_ids_file: str = None
use_language_embedding: bool = False
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_dim: int = None
# testing
test_sentences: List[List[str]] = field(
default_factory=lambda: [
["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
["Be a voice, not an echo."],
["I'm sorry Dave. I'm afraid I can't do that."],
["This cake is great. It's so delicious and moist."],
["Prior to November 22, 1963."],
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
+183
View File
@@ -0,0 +1,183 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs
@dataclass
class FastPitchConfig(BaseTTSConfig):
"""Configure `ForwardTTS` as FastPitch model.
Example:
>>> from TTS.tts.configs.fast_pitch_config import FastPitchConfig
>>> config = FastPitchConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `Adam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `Noam`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-3`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
spec_loss_alpha (float):
Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
pitch_loss_alpha (float):
Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
binary_align_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
binary_loss_warmup_epochs (float):
Number of epochs to gradually increase the binary loss impact. Defaults to 150.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0(bool):
Compute pitch. defaults to True
f0_cache_path(str):
pith cache path. defaults to None
"""
model: str = "fast_pitch"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
# multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = False
d_vector_dim: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
# loss params
spec_loss_type: str = "mse"
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 0.1
dur_loss_alpha: float = 0.1
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 150
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1 # DO NOT CHANGE
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
+177
View File
@@ -0,0 +1,177 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs
@dataclass
class FastSpeechConfig(BaseTTSConfig):
"""Configure `ForwardTTS` as FastSpeech model.
Example:
>>> from TTS.tts.configs.fast_speech_config import FastSpeechConfig
>>> config = FastSpeechConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `FastSpeechArgs` for more details. Defaults to `FastSpeechArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `Adam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `Noam`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-3`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
spec_loss_alpha (float):
Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
pitch_loss_alpha (float):
Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
binary_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
binary_loss_warmup_epochs (float):
Number of epochs to gradually increase the binary loss impact. Defaults to 150.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "fast_speech"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
# multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = False
d_vector_dim: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
# loss params
spec_loss_type: str = "mse"
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
dur_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
pitch_loss_alpha: float = 0.0
aligner_loss_alpha: float = 1.0
binary_align_loss_alpha: float = 1.0
binary_loss_warmup_epochs: int = 150
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1 # DO NOT CHANGE
# dataset configs
compute_f0: bool = False
f0_cache_path: str = None
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
+198
View File
@@ -0,0 +1,198 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs
@dataclass
class Fastspeech2Config(BaseTTSConfig):
"""Configure `ForwardTTS` as FastPitch model.
Example:
>>> from TTS.tts.configs.fastspeech2_config import FastSpeech2Config
>>> config = FastSpeech2Config()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `Adam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `Noam`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-3`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
spec_loss_alpha (float):
Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
pitch_loss_alpha (float):
Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
energy_loss_alpha (float):
Weight for the energy predictor's loss. If set 0, disables the energy predictor. Defaults to 1.0.
binary_align_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
binary_loss_warmup_epochs (float):
Number of epochs to gradually increase the binary loss impact. Defaults to 150.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0(bool):
Compute pitch. defaults to True
f0_cache_path(str):
pith cache path. defaults to None
# dataset configs
compute_energy(bool):
Compute energy. defaults to True
energy_cache_path(str):
energy cache path. defaults to None
"""
model: str = "fastspeech2"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
# multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = False
d_vector_dim: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
# loss params
spec_loss_type: str = "mse"
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 0.1
energy_loss_alpha: float = 0.1
dur_loss_alpha: float = 0.1
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 150
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1 # DO NOT CHANGE
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
# dataset configs
compute_energy: bool = True
energy_cache_path: str = None
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
+182
View File
@@ -0,0 +1,182 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class GlowTTSConfig(BaseTTSConfig):
"""Defines parameters for GlowTTS model.
Example:
>>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
>>> config = GlowTTSConfig()
Args:
model(str):
Model name used for selecting the right model at initialization. Defaults to `glow_tts`.
encoder_type (str):
Type of the encoder used by the model. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
Defaults to `rel_pos_transformers`.
encoder_params (dict):
Parameters used to define the encoder network. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
use_encoder_prenet (bool):
enable / disable the use of a prenet for the encoder. Defaults to True.
hidden_channels_enc (int):
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
and for some encoder types internal hidden channels sizes too. Defaults to 192.
hidden_channels_dec (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
hidden_channels_dp (int):
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
mean_only (bool):
If true predict only the mean values by the decoder flow. Defaults to True.
out_channels (int):
Number of channels of the model output tensor. Defaults to 80.
num_flow_blocks_dec (int):
Number of decoder blocks. Defaults to 12.
inference_noise_scale (float):
Noise scale used at inference. Defaults to 0.33.
kernel_size_dec (int):
Decoder kernel size. Defaults to 5
dilation_rate (int):
Rate to increase dilation by each layer in a decoder block. Defaults to 1.
num_block_layers (int):
Number of decoder layers in each decoder block. Defaults to 4.
dropout_p_dec (float):
Dropout rate for decoder. Defaults to 0.1.
num_speaker (int):
Number of speaker to define the size of speaker embedding layer. Defaults to 0.
c_in_channels (int):
Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0.
num_splits (int):
Number of split levels in inversible conv1x1 operation. Defaults to 4.
num_squeeze (int):
Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor
'num_squeeze'. Defaults to 2.
sigmoid_scale (bool):
enable/disable sigmoid scaling in decoder. Defaults to False.
mean_only (bool):
If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true.
encoder_type (str):
Encoder module type. Possible values are`["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]`
Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformers` as in the original paper.
encoder_params (dict):
Encoder module parameters. Defaults to None.
d_vector_dim (int):
Channels of external speaker embedding vectors. Defaults to 0.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
style_wav_for_test (str):
Path to the wav file used for changing the style of the speech. Defaults to None.
inference_noise_scale (float):
Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0.
length_scale (float):
Multiply the predicted durations with this value to change the speech speed. Defaults to 1.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults 4000.
lr (float):
Initial learning rate. Defaults to `1e-3`.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "glow_tts"
# model params
num_chars: int = None
encoder_type: str = "rel_pos_transformer"
encoder_params: dict = field(
default_factory=lambda: {
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 768,
}
)
use_encoder_prenet: bool = True
hidden_channels_enc: int = 192
hidden_channels_dec: int = 192
hidden_channels_dp: int = 256
dropout_p_dp: float = 0.1
dropout_p_dec: float = 0.05
mean_only: bool = True
out_channels: int = 80
num_flow_blocks_dec: int = 12
inference_noise_scale: float = 0.33
kernel_size_dec: int = 5
dilation_rate: int = 1
num_block_layers: int = 4
num_speakers: int = 0
c_in_channels: int = 0
num_splits: int = 4
num_squeeze: int = 2
sigmoid_scale: bool = False
encoder_type: str = "rel_pos_transformer"
encoder_params: dict = field(
default_factory=lambda: {
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 768,
"input_length": None,
}
)
d_vector_dim: int = 0
# training params
data_dep_init_steps: int = 10
# inference params
style_wav_for_test: str = None
inference_noise_scale: float = 0.0
length_scale: float = 1.0
# multi-speaker settings
use_speaker_embedding: bool = False
speakers_file: str = None
use_d_vector_file: bool = False
d_vector_file: str = False
# optimizer parameters
optimizer: str = "RAdam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
grad_clip: float = 5.0
lr: float = 1e-3
# overrides
min_seq_len: int = 3
max_seq_len: int = 500
r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it.
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
+170
View File
@@ -0,0 +1,170 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class NeuralhmmTTSConfig(BaseTTSConfig):
"""
Define parameters for Neural HMM TTS model.
Example:
>>> from TTS.tts.configs.overflow_config import OverflowConfig
>>> config = OverflowConfig()
Args:
model (str):
Model name used to select the right model class to initilize. Defaults to `Overflow`.
run_eval_steps (int):
Run evalulation epoch after N steps. If None, waits until training epoch is completed. Defaults to None.
save_step (int):
Save local checkpoint every save_step steps. Defaults to 500.
plot_step (int):
Plot training stats on the logger every plot_step steps. Defaults to 1.
model_param_stats (bool):
Log model parameters stats on the logger dashboard. Defaults to False.
force_generate_statistics (bool):
Force generate mel normalization statistics. Defaults to False.
mel_statistics_parameter_path (str):
Path to the mel normalization statistics.If the model doesn't finds a file there it will generate statistics.
Defaults to None.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
state_per_phone (int):
Generates N states per phone. Similar, to `add_blank` parameter in GlowTTS but in Overflow it is upsampled by model's encoder. Defaults to 2.
encoder_in_out_features (int):
Channels of encoder input and character embedding tensors. Defaults to 512.
encoder_n_convolutions (int):
Number of convolution layers in the encoder. Defaults to 3.
out_channels (int):
Channels of the final model output. It must match the spectragram size. Defaults to 80.
ar_order (int):
Autoregressive order of the model. Defaults to 1. In ablations of Neural HMM it was found that more autoregression while giving more variation hurts naturalness of the synthesised audio.
sampling_temp (float):
Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
deterministic_transition (bool):
deterministic duration generation based on duration quantiles as defiend in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Medianbased generation of synthetic speech durations using a nonparametric approach,” in Proc. SLT, 2016.". Defaults to True.
duration_threshold (float):
Threshold for duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis, where lower values defines a slower speaking rate and higher values defines a faster speaking rate.
use_grad_checkpointing (bool):
Use gradient checkpointing to save memory. In a multi-GPU setting currently pytorch does not supports gradient checkpoint inside a loop so we will have to turn it off then.Adjust depending on whatever get more batch size either by using a single GPU or multi-GPU. Defaults to True.
max_sampling_time (int):
Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dim (int):
Dimension of the Prenet. Defaults to 256.
prenet_n_layers (int):
Number of layers in the Prenet. Defaults to 2.
prenet_dropout (float):
Dropout rate of the Prenet. Defaults to 0.5.
prenet_dropout_at_inference (bool):
Use dropout at inference time. Defaults to False.
memory_rnn_dim (int):
Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
outputnet_size (list[int]):
Size of the output network inside the neural HMM. Defaults to [1024].
flat_start_params (dict):
Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
It will be recomputed when you pass the dataset.
std_floor (float):
Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
It is called `variance flooring` in standard HMM literature.
optimizer (str):
Optimizer to use for training. Defaults to `adam`.
optimizer_params (dict):
Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
grad_clip (float):
Gradient clipping threshold. Defaults to 40_000.
lr (float):
Learning rate. Defaults to 1e-3.
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to `None`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "NeuralHMM_TTS"
# Training and Checkpoint configs
run_eval_steps: int = 100
save_step: int = 500
plot_step: int = 1
model_param_stats: bool = False
# data parameters
force_generate_statistics: bool = False
mel_statistics_parameter_path: str = None
# Encoder parameters
num_chars: int = None
state_per_phone: int = 2
encoder_in_out_features: int = 512
encoder_n_convolutions: int = 3
# HMM parameters
out_channels: int = 80
ar_order: int = 1
sampling_temp: float = 0
deterministic_transition: bool = True
duration_threshold: float = 0.43
use_grad_checkpointing: bool = True
max_sampling_time: int = 1000
## Prenet parameters
prenet_type: str = "original"
prenet_dim: int = 256
prenet_n_layers: int = 2
prenet_dropout: float = 0.5
prenet_dropout_at_inference: bool = True
memory_rnn_dim: int = 1024
## Outputnet parameters
outputnet_size: List[int] = field(default_factory=lambda: [1024])
flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
std_floor: float = 0.001
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
grad_clip: float = 40000.0
lr: float = 1e-3
lr_scheduler: str = None
# overrides
min_text_len: int = 10
max_text_len: int = 500
min_audio_len: int = 512
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"Be a voice, not an echo.",
]
)
# Extra needed config
r: int = 1
use_d_vector_file: bool = False
use_speaker_embedding: bool = False
def check_values(self):
"""Validate the hyperparameters.
Raises:
AssertionError: when the parameters network is not defined
AssertionError: transition probability is not between 0 and 1
"""
assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."
assert (
len(self.outputnet_size) >= 1
), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
assert (
0 < self.flat_start_params["transition_p"] < 1
), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
+201
View File
@@ -0,0 +1,201 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class OverflowConfig(BaseTTSConfig): # The classname has to be camel case
"""
Define parameters for OverFlow model.
Example:
>>> from TTS.tts.configs.overflow_config import OverflowConfig
>>> config = OverflowConfig()
Args:
model (str):
Model name used to select the right model class to initilize. Defaults to `Overflow`.
run_eval_steps (int):
Run evalulation epoch after N steps. If None, waits until training epoch is completed. Defaults to None.
save_step (int):
Save local checkpoint every save_step steps. Defaults to 500.
plot_step (int):
Plot training stats on the logger every plot_step steps. Defaults to 1.
model_param_stats (bool):
Log model parameters stats on the logger dashboard. Defaults to False.
force_generate_statistics (bool):
Force generate mel normalization statistics. Defaults to False.
mel_statistics_parameter_path (str):
Path to the mel normalization statistics.If the model doesn't finds a file there it will generate statistics.
Defaults to None.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
state_per_phone (int):
Generates N states per phone. Similar, to `add_blank` parameter in GlowTTS but in Overflow it is upsampled by model's encoder. Defaults to 2.
encoder_in_out_features (int):
Channels of encoder input and character embedding tensors. Defaults to 512.
encoder_n_convolutions (int):
Number of convolution layers in the encoder. Defaults to 3.
out_channels (int):
Channels of the final model output. It must match the spectragram size. Defaults to 80.
ar_order (int):
Autoregressive order of the model. Defaults to 1. In ablations of Neural HMM it was found that more autoregression while giving more variation hurts naturalness of the synthesised audio.
sampling_temp (float):
Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
deterministic_transition (bool):
deterministic duration generation based on duration quantiles as defiend in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Medianbased generation of synthetic speech durations using a nonparametric approach,” in Proc. SLT, 2016.". Defaults to True.
duration_threshold (float):
Threshold for duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis, where lower values defines a slower speaking rate and higher values defines a faster speaking rate.
use_grad_checkpointing (bool):
Use gradient checkpointing to save memory. In a multi-GPU setting currently pytorch does not supports gradient checkpoint inside a loop so we will have to turn it off then.Adjust depending on whatever get more batch size either by using a single GPU or multi-GPU. Defaults to True.
max_sampling_time (int):
Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dim (int):
Dimension of the Prenet. Defaults to 256.
prenet_n_layers (int):
Number of layers in the Prenet. Defaults to 2.
prenet_dropout (float):
Dropout rate of the Prenet. Defaults to 0.5.
prenet_dropout_at_inference (bool):
Use dropout at inference time. Defaults to False.
memory_rnn_dim (int):
Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
outputnet_size (list[int]):
Size of the output network inside the neural HMM. Defaults to [1024].
flat_start_params (dict):
Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
It will be recomputed when you pass the dataset.
std_floor (float):
Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
It is called `variance flooring` in standard HMM literature.
hidden_channels_dec (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 150.
kernel_size_dec (int):
Decoder kernel size. Defaults to 5
dilation_rate (int):
Rate to increase dilation by each layer in a decoder block. Defaults to 1.
num_flow_blocks_dec (int):
Number of decoder layers in each decoder block. Defaults to 4.
dropout_p_dec (float):
Dropout rate of the decoder. Defaults to 0.05.
num_splits (int):
Number of split levels in inversible conv1x1 operation. Defaults to 4.
num_squeeze (int):
Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor
'num_squeeze'. Defaults to 2.
sigmoid_scale (bool):
enable/disable sigmoid scaling in decoder. Defaults to False.
c_in_channels (int):
Unused parameter from GlowTTS's decoder. Defaults to 0.
optimizer (str):
Optimizer to use for training. Defaults to `adam`.
optimizer_params (dict):
Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
grad_clip (float):
Gradient clipping threshold. Defaults to 40_000.
lr (float):
Learning rate. Defaults to 1e-3.
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to `None`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "Overflow"
# Training and Checkpoint configs
run_eval_steps: int = 100
save_step: int = 500
plot_step: int = 1
model_param_stats: bool = False
# data parameters
force_generate_statistics: bool = False
mel_statistics_parameter_path: str = None
# Encoder parameters
num_chars: int = None
state_per_phone: int = 2
encoder_in_out_features: int = 512
encoder_n_convolutions: int = 3
# HMM parameters
out_channels: int = 80
ar_order: int = 1
sampling_temp: float = 0.334
deterministic_transition: bool = True
duration_threshold: float = 0.55
use_grad_checkpointing: bool = True
max_sampling_time: int = 1000
## Prenet parameters
prenet_type: str = "original"
prenet_dim: int = 256
prenet_n_layers: int = 2
prenet_dropout: float = 0.5
prenet_dropout_at_inference: bool = False
memory_rnn_dim: int = 1024
## Outputnet parameters
outputnet_size: List[int] = field(default_factory=lambda: [1024])
flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
std_floor: float = 0.01
# Decoder parameters
hidden_channels_dec: int = 150
kernel_size_dec: int = 5
dilation_rate: int = 1
num_flow_blocks_dec: int = 12
num_block_layers: int = 4
dropout_p_dec: float = 0.05
num_splits: int = 4
num_squeeze: int = 2
sigmoid_scale: bool = False
c_in_channels: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
grad_clip: float = 40000.0
lr: float = 1e-3
lr_scheduler: str = None
# overrides
min_text_len: int = 10
max_text_len: int = 500
min_audio_len: int = 512
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"Be a voice, not an echo.",
]
)
# Extra needed config
r: int = 1
use_d_vector_file: bool = False
use_speaker_embedding: bool = False
def check_values(self):
"""Validate the hyperparameters.
Raises:
AssertionError: when the parameters network is not defined
AssertionError: transition probability is not between 0 and 1
"""
assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."
assert (
len(self.outputnet_size) >= 1
), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
assert (
0 < self.flat_start_params["transition_p"] < 1
), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
+344
View File
@@ -0,0 +1,344 @@
from dataclasses import asdict, dataclass, field
from typing import Dict, List
from coqpit import Coqpit, check_argument
from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
@dataclass
class GSTConfig(Coqpit):
"""Defines the Global Style Token Module
Args:
gst_style_input_wav (str):
Path to the wav file used to define the style of the output speech at inference. Defaults to None.
gst_style_input_weights (dict):
Defines the weights for each style token used at inference. Defaults to None.
gst_embedding_dim (int):
Defines the size of the GST embedding vector dimensions. Defaults to 256.
gst_num_heads (int):
Number of attention heads used by the multi-head attention. Defaults to 4.
gst_num_style_tokens (int):
Number of style token vectors. Defaults to 10.
"""
gst_style_input_wav: str = None
gst_style_input_weights: dict = None
gst_embedding_dim: int = 256
gst_use_speaker_embedding: bool = False
gst_num_heads: int = 4
gst_num_style_tokens: int = 10
def check_values(
self,
):
"""Check config fields"""
c = asdict(self)
super().check_values()
check_argument("gst_style_input_weights", c, restricted=False)
check_argument("gst_style_input_wav", c, restricted=False)
check_argument("gst_embedding_dim", c, restricted=True, min_val=0, max_val=1000)
check_argument("gst_use_speaker_embedding", c, restricted=False)
check_argument("gst_num_heads", c, restricted=True, min_val=2, max_val=10)
check_argument("gst_num_style_tokens", c, restricted=True, min_val=1, max_val=1000)
@dataclass
class CapacitronVAEConfig(Coqpit):
"""Defines the capacitron VAE Module
Args:
capacitron_capacity (int):
Defines the variational capacity limit of the prosody embeddings. Defaults to 150.
capacitron_VAE_embedding_dim (int):
Defines the size of the Capacitron embedding vector dimension. Defaults to 128.
capacitron_use_text_summary_embeddings (bool):
If True, use a text summary embedding in Capacitron. Defaults to True.
capacitron_text_summary_embedding_dim (int):
Defines the size of the capacitron text embedding vector dimension. Defaults to 128.
capacitron_use_speaker_embedding (bool):
if True use speaker embeddings in Capacitron. Defaults to False.
capacitron_VAE_loss_alpha (float):
Weight for the VAE loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
capacitron_grad_clip (float):
Gradient clipping value for all gradients except beta. Defaults to 5.0
"""
capacitron_loss_alpha: int = 1
capacitron_capacity: int = 150
capacitron_VAE_embedding_dim: int = 128
capacitron_use_text_summary_embeddings: bool = True
capacitron_text_summary_embedding_dim: int = 128
capacitron_use_speaker_embedding: bool = False
capacitron_VAE_loss_alpha: float = 0.25
capacitron_grad_clip: float = 5.0
def check_values(
self,
):
"""Check config fields"""
c = asdict(self)
super().check_values()
check_argument("capacitron_capacity", c, restricted=True, min_val=10, max_val=500)
check_argument("capacitron_VAE_embedding_dim", c, restricted=True, min_val=16, max_val=1024)
check_argument("capacitron_use_speaker_embedding", c, restricted=False)
check_argument("capacitron_text_summary_embedding_dim", c, restricted=False, min_val=16, max_val=512)
check_argument("capacitron_VAE_loss_alpha", c, restricted=False)
check_argument("capacitron_grad_clip", c, restricted=False)
@dataclass
class CharactersConfig(Coqpit):
"""Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses.
Args:
characters_class (str):
Defines the class of the characters used. If None, we pick ```Phonemes``` or ```Graphemes``` based on
the configuration. Defaults to None.
vocab_dict (dict):
Defines the vocabulary dictionary used to encode the characters. Defaults to None.
pad (str):
characters in place of empty padding. Defaults to None.
eos (str):
characters showing the end of a sentence. Defaults to None.
bos (str):
characters showing the beginning of a sentence. Defaults to None.
blank (str):
Optional character used between characters by some models for better prosody. Defaults to `_blank`.
characters (str):
character set used by the model. Characters not in this list are ignored when converting input text to
a list of sequence IDs. Defaults to None.
punctuations (str):
characters considered as punctuation as parsing the input sentence. Defaults to None.
phonemes (str):
characters considered as parsing phonemes. This is only for backwards compat. Use `characters` for new
models. Defaults to None.
is_unique (bool):
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
models trained with character lists with duplicates. Defaults to True.
is_sorted (bool):
Sort the characters in alphabetical order. Defaults to True.
"""
characters_class: str = None
# using BaseVocabulary
vocab_dict: Dict = None
# using on BaseCharacters
pad: str = None
eos: str = None
bos: str = None
blank: str = None
characters: str = None
punctuations: str = None
phonemes: str = None
is_unique: bool = True # for backwards compatibility of models trained with char sets with duplicates
is_sorted: bool = True
@dataclass
class BaseTTSConfig(BaseTrainingConfig):
"""Shared parameters among all the tts models.
Args:
audio (BaseAudioConfig):
Audio processor config object instance.
use_phonemes (bool):
enable / disable phoneme use.
phonemizer (str):
Name of the phonemizer to use. If set None, the phonemizer will be selected by `phoneme_language`.
Defaults to None.
phoneme_language (str):
Language code for the phonemizer. You can check the list of supported languages by running
`python TTS/tts/utils/text/phonemizers/__init__.py`. Defaults to None.
compute_input_seq_cache (bool):
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
the training, It allows faster data loader time and precise limitation with `max_seq_len` and
`min_seq_len`.
text_cleaner (str):
Name of the text cleaner used for cleaning and formatting transcripts.
enable_eos_bos_chars (bool):
enable / disable the use of eos and bos characters.
test_senteces_file (str):
Path to a txt file that has sentences used at test time. The file must have a sentence per line.
phoneme_cache_path (str):
Path to the output folder caching the computed phonemes for each sample.
characters (CharactersConfig):
Instance of a CharactersConfig class.
batch_group_size (int):
Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
prevent using the same batches for each epoch.
loss_masking (bool):
enable / disable masking loss values against padded segments of samples in a batch.
min_text_len (int):
Minimum length of input text to be used. All shorter samples will be ignored. Defaults to 0.
max_text_len (int):
Maximum length of input text to be used. All longer samples will be ignored. Defaults to float("inf").
min_audio_len (int):
Minimum length of input audio to be used. All shorter samples will be ignored. Defaults to 0.
max_audio_len (int):
Maximum length of input audio to be used. All longer samples will be ignored. The maximum length in the
dataset defines the VRAM used in the training. Hence, pay attention to this value if you encounter an
OOM error in training. Defaults to float("inf").
compute_f0 (int):
(Not in use yet).
compute_energy (int):
(Not in use yet).
compute_linear_spec (bool):
If True data loader computes and returns linear spectrograms alongside the other data.
precompute_num_workers (int):
Number of workers to precompute features. Defaults to 0.
use_noise_augment (bool):
Augment the input audio with random noise.
start_by_longest (bool):
If True, the data loader will start loading the longest batch first. It is useful for checking OOM issues.
Defaults to False.
shuffle (bool):
If True, the data loader will shuffle the dataset when there is not sampler defined. Defaults to True.
drop_last (bool):
If True, the data loader will drop the last batch if it is not complete. It helps to prevent
issues that emerge from the partial batch statistics. Defaults to True.
add_blank (bool):
Add blank characters between each other two characters. It improves performance for some models at expense
of slower run-time due to the longer input sequence.
datasets (List[BaseDatasetConfig]):
List of datasets used for training. If multiple datasets are provided, they are merged and used together
for training.
optimizer (str):
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
Defaults to ``.
optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to ``.
lr_scheduler_params (dict):
Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
test_sentences (List[str]):
List of sentences to be used at testing. Defaults to '[]'
eval_split_max_size (int):
Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
eval_split_size (float):
If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
use_speaker_weighted_sampler (bool):
Enable / Disable the batch balancer by speaker. Defaults to ```False```.
speaker_weighted_sampler_alpha (float):
Number that control the influence of the speaker sampler weights. Defaults to ```1.0```.
use_language_weighted_sampler (bool):
Enable / Disable the batch balancer by language. Defaults to ```False```.
language_weighted_sampler_alpha (float):
Number that control the influence of the language sampler weights. Defaults to ```1.0```.
use_length_weighted_sampler (bool):
Enable / Disable the batch balancer by audio length. If enabled the dataset will be divided
into 10 buckets considering the min and max audio of the dataset. The sampler weights will be
computed forcing to have the same quantity of data for each bucket in each training batch. Defaults to ```False```.
length_weighted_sampler_alpha (float):
Number that control the influence of the length sampler weights. Defaults to ```1.0```.
"""
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
# phoneme settings
use_phonemes: bool = False
phonemizer: str = None
phoneme_language: str = None
compute_input_seq_cache: bool = False
text_cleaner: str = None
enable_eos_bos_chars: bool = False
test_sentences_file: str = ""
phoneme_cache_path: str = None
# vocabulary parameters
characters: CharactersConfig = None
add_blank: bool = False
# training params
batch_group_size: int = 0
loss_masking: bool = None
# dataloading
min_audio_len: int = 1
max_audio_len: int = float("inf")
min_text_len: int = 1
max_text_len: int = float("inf")
compute_f0: bool = False
compute_energy: bool = False
compute_linear_spec: bool = False
precompute_num_workers: int = 0
use_noise_augment: bool = False
start_by_longest: bool = False
shuffle: bool = False
drop_last: bool = False
# dataset
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# optimizer
optimizer: str = "radam"
optimizer_params: dict = None
# scheduler
lr_scheduler: str = None
lr_scheduler_params: dict = field(default_factory=lambda: {})
# testing
test_sentences: List[str] = field(default_factory=lambda: [])
# evaluation
eval_split_max_size: int = None
eval_split_size: float = 0.01
# weighted samplers
use_speaker_weighted_sampler: bool = False
speaker_weighted_sampler_alpha: float = 1.0
use_language_weighted_sampler: bool = False
language_weighted_sampler_alpha: float = 1.0
use_length_weighted_sampler: bool = False
length_weighted_sampler_alpha: float = 1.0
+194
View File
@@ -0,0 +1,194 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs
@dataclass
class SpeedySpeechConfig(BaseTTSConfig):
"""Configure `ForwardTTS` as SpeedySpeech model.
Example:
>>> from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
>>> config = SpeedySpeechConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `FastPitchArgs` for more details. Defaults to `FastPitchArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `RAdam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `Noam`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-3`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `l1`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `huber`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set 0, disables the huber loss. Defaults to 1.0.
spec_loss_alpha (float):
Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
binary_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
binary_loss_warmup_epochs (float):
Number of epochs to gradually increase the binary loss impact. Defaults to 150.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "speedy_speech"
base_model: str = "forward_tts"
# set model args as SpeedySpeech
model_args: ForwardTTSArgs = field(
default_factory=lambda: ForwardTTSArgs(
use_pitch=False,
encoder_type="residual_conv_bn",
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
},
decoder_type="residual_conv_bn",
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
},
out_channels=80,
hidden_channels=128,
positional_encoding=True,
detach_duration_predictor=True,
)
)
# multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = False
d_vector_dim: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
# loss params
spec_loss_type: str = "l1"
duration_loss_type: str = "huber"
use_ssim_loss: bool = False
ssim_loss_alpha: float = 1.0
dur_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
binary_align_loss_alpha: float = 0.3
binary_loss_warmup_epochs: int = 150
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1 # DO NOT CHANGE
# dataset configs
compute_f0: bool = False
f0_cache_path: str = None
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file
+21
View File
@@ -0,0 +1,21 @@
from dataclasses import dataclass
from TTS.tts.configs.tacotron_config import TacotronConfig
@dataclass
class Tacotron2Config(TacotronConfig):
"""Defines parameters for Tacotron2 based models.
Example:
>>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
>>> config = Tacotron2Config()
Check `TacotronConfig` for argument descriptions.
"""
model: str = "tacotron2"
out_channels: int = 80
encoder_in_features: int = 512
decoder_in_features: int = 512
+235
View File
@@ -0,0 +1,235 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig
@dataclass
class TacotronConfig(BaseTTSConfig):
"""Defines parameters for Tacotron based models.
Example:
>>> from TTS.tts.configs.tacotron_config import TacotronConfig
>>> config = TacotronConfig()
Args:
model (str):
Model name used to select the right model class to initilize. Defaults to `Tacotron`.
use_gst (bool):
enable / disable the use of Global Style Token modules. Defaults to False.
gst (GSTConfig):
Instance of `GSTConfig` class.
gst_style_input (str):
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
this is not defined, the model uses a zero vector as an input. Defaults to None.
use_capacitron_vae (bool):
enable / disable the use of Capacitron modules. Defaults to False.
capacitron_vae (CapacitronConfig):
Instance of `CapacitronConfig` class.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
num_speakers (int):
Number of speakers for multi-speaker models. Defaults to 1.
r (int):
Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference
faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in
`gradual_training` schedule. Defaults to 1.
gradual_training (List[List]):
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
If sets None, no gradual training is used. Defaults to None.
memory_size (int):
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
Defaults to -1.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dropout (bool):
enables / disables the use of dropout in the Prenet. Defaults to True.
prenet_dropout_at_inference (bool):
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
stopnet (bool):
enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
stopnet_pos_weight (float):
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
datasets with longer sentences. Defaults to 0.2.
max_decoder_steps (int):
Max number of steps allowed for the decoder. Defaults to 50.
encoder_in_features (int):
Channels of encoder input and character embedding tensors. Defaults to 256.
decoder_in_features (int):
Channels of decoder input and encoder output tensors. Defaults to 256.
out_channels (int):
Channels of the final model output. It must match the spectragram size. Defaults to 80.
separate_stopnet (bool):
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
attention_type (str):
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
attention_heads (int):
Number of attention heads for GMM attention. Defaults to 5.
windowing (bool):
It especially useful at inference to keep attention alignment diagonal. Defaults to False.
use_forward_attn (bool):
It is only valid if ```attn_type``` is ```original```. Defaults to False.
forward_attn_mask (bool):
enable/disable extra masking over forward attention. It is useful at inference to prevent
possible attention failures. Defaults to False.
transition_agent (bool):
enable/disable transition agent in forward attention. Defaults to False.
location_attn (bool):
enable/disable location sensitive attention as in the original Tacotron2 paper.
It is only valid if ```attn_type``` is ```original```. Defaults to True.
bidirectional_decoder (bool):
enable/disable bidirectional decoding. Defaults to False.
double_decoder_consistency (bool):
enable/disable double decoder consistency. Defaults to False.
ddc_r (int):
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
as a multiple of the `r` value. Defaults to 6.
speakers_file (str):
Path to the speaker mapping file for the Speaker Manager. Defaults to None.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
optimizer (str):
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
Defaults to `RAdam`.
optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to `NoamLR`.
lr_scheduler_params (dict):
Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-4`.
wd (float):
Weight decay coefficient. Defaults to `1e-6`.
grad_clip (float):
Gradient clipping threshold. Defaults to `5`.
seq_len_norm (bool):
enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
is divided by the sequence length. Defaults to False.
loss_masking (bool):
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
decoder_loss_alpha (float):
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_loss_alpha (float):
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_diff_spec_alpha (float):
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_diff_spec_alpha (float):
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_ssim_alpha (float):
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_ssim_alpha (float):
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
ga_alpha (float):
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
function. Defaults to 5.
"""
model: str = "tacotron"
# model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs())
use_gst: bool = False
gst: GSTConfig = None
gst_style_input: str = None
use_capacitron_vae: bool = False
capacitron_vae: CapacitronVAEConfig = None
# model specific params
num_speakers: int = 1
num_chars: int = 0
r: int = 2
gradual_training: List[List[int]] = None
memory_size: int = -1
prenet_type: str = "original"
prenet_dropout: bool = True
prenet_dropout_at_inference: bool = False
stopnet: bool = True
separate_stopnet: bool = True
stopnet_pos_weight: float = 0.2
max_decoder_steps: int = 10000
encoder_in_features: int = 256
decoder_in_features: int = 256
decoder_output_dim: int = 80
out_channels: int = 513
# attention layers
attention_type: str = "original"
attention_heads: int = None
attention_norm: str = "sigmoid"
attention_win: bool = False
windowing: bool = False
use_forward_attn: bool = False
forward_attn_mask: bool = False
transition_agent: bool = False
location_attn: bool = True
# advance methods
bidirectional_decoder: bool = False
double_decoder_consistency: bool = False
ddc_r: int = 6
# multi-speaker settings
speakers_file: str = None
use_speaker_embedding: bool = False
speaker_embedding_dim: int = 512
use_d_vector_file: bool = False
d_vector_file: str = False
d_vector_dim: int = None
# optimizer parameters
optimizer: str = "RAdam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
seq_len_norm: bool = False
loss_masking: bool = True
# loss params
decoder_loss_alpha: float = 0.25
postnet_loss_alpha: float = 0.25
postnet_diff_spec_alpha: float = 0.25
decoder_diff_spec_alpha: float = 0.25
decoder_ssim_alpha: float = 0.25
postnet_ssim_alpha: float = 0.25
ga_alpha: float = 5.0
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def check_values(self):
if self.gradual_training:
assert (
self.gradual_training[0][1] == self.r
), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
if self.model == "tacotron" and self.audio is not None:
assert self.out_channels == (
self.audio.fft_size // 2 + 1
), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}"
if self.model == "tacotron2" and self.audio is not None:
assert self.out_channels == self.audio.num_mels
+87
View File
@@ -0,0 +1,87 @@
from dataclasses import dataclass, field
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig
@dataclass
class TortoiseConfig(BaseTTSConfig):
"""Defines parameters for Tortoise TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (TortoiseArgs):
Model architecture arguments. Defaults to `TortoiseArgs()`.
audio (TortoiseAudioConfig):
Audio processing configuration. Defaults to `TortoiseAudioConfig()`.
model_dir (str):
Path to the folder that has all the Tortoise models. Defaults to None.
temperature (float):
Temperature for the autoregressive model inference. Larger values makes predictions more creative sacrificing stability. Defaults to `0.2`.
length_penalty (float):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
reperation_penalty (float):
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
top_p (float):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.8`.
cond_free_k (float):
Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
diffusion_temperature (float):
Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
are the "mean" prediction of the diffusion network and will sound bland and smeared.
Defaults to `1.0`.
num_autoregressive_samples (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `16`.
diffusion_iterations (int):
Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
however. Defaults to `30`.
sampler (str):
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.tortoise_config import TortoiseConfig
>>> config = TortoiseConfig()
"""
model: str = "tortoise"
# model specific params
model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
model_dir: str = None
# settings
temperature: float = 0.2
length_penalty: float = 1.0
repetition_penalty: float = 2.0
top_p: float = 0.8
cond_free_k: float = 2.0
diffusion_temperature: float = 1.0
# inference params
num_autoregressive_samples: int = 16
diffusion_iterations: int = 30
sampler: str = "ddim"
+176
View File
@@ -0,0 +1,176 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
@dataclass
class VitsConfig(BaseTTSConfig):
"""Defines parameters for VITS End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (VitsArgs):
Model architecture arguments. Defaults to `VitsArgs()`.
audio (VitsAudioConfig):
Audio processing configuration. Defaults to `VitsAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.vits_config import VitsConfig
>>> config = VitsConfig()
"""
model: str = "vits"
# model specific params
model_args: VitsArgs = field(default_factory=VitsArgs)
audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
# optimizer
grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
lr_gen: float = 0.0002
lr_disc: float = 0.0002
lr_scheduler_gen: str = "ExponentialLR"
lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
lr_scheduler_disc: str = "ExponentialLR"
lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
scheduler_after_epoch: bool = True
optimizer: str = "AdamW"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
# loss params
kl_loss_alpha: float = 1.0
disc_loss_alpha: float = 1.0
gen_loss_alpha: float = 1.0
feat_loss_alpha: float = 1.0
mel_loss_alpha: float = 45.0
dur_loss_alpha: float = 1.0
speaker_encoder_loss_alpha: float = 1.0
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# testing
test_sentences: List[List] = field(
default_factory=lambda: [
["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
["Be a voice, not an echo."],
["I'm sorry Dave. I'm afraid I can't do that."],
["This cake is great. It's so delicious and moist."],
["Prior to November 22, 1963."],
]
)
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
use_speaker_embedding: bool = False
speakers_file: str = None
speaker_embedding_channels: int = 256
language_ids_file: str = None
use_language_embedding: bool = False
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val
+107
View File
@@ -0,0 +1,107 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
@dataclass
class XttsConfig(BaseTTSConfig):
"""Defines parameters for XTTS TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (XttsArgs):
Model architecture arguments. Defaults to `XttsArgs()`.
audio (XttsAudioConfig):
Audio processing configuration. Defaults to `XttsAudioConfig()`.
model_dir (str):
Path to the folder that has all the XTTS models. Defaults to None.
temperature (float):
Temperature for the autoregressive model inference. Larger values makes predictions more creative sacrificing stability. Defaults to `0.2`.
length_penalty (float):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
repetition_penalty (float):
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
top_p (float):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.8`.
num_gpt_outputs (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `16`.
gpt_cond_len (int):
Secs audio to be used as conditioning for the autoregressive model. Defaults to `12`.
gpt_cond_chunk_len (int):
Audio chunk size in secs. Audio is split into chunks and latents are extracted for each chunk. Then the
latents are averaged. Chunking improves the stability. It must be <= gpt_cond_len.
If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to `4`.
max_ref_len (int):
Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.
sound_norm_refs (bool):
Whether to normalize the conditioning audio. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.xtts_config import XttsConfig
>>> config = XttsConfig()
"""
model: str = "xtts"
# model specific params
model_args: XttsArgs = field(default_factory=XttsArgs)
audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
model_dir: str = None
languages: List[str] = field(
default_factory=lambda: [
"en",
"es",
"fr",
"de",
"it",
"pt",
"pl",
"tr",
"ru",
"nl",
"cs",
"ar",
"zh-cn",
"hu",
"ko",
"ja",
"hi",
]
)
# inference params
temperature: float = 0.85
length_penalty: float = 1.0
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
num_gpt_outputs: int = 1
# cloning
gpt_cond_len: int = 12
gpt_cond_chunk_len: int = 4
max_ref_len: int = 10
sound_norm_refs: bool = False
+181
View File
@@ -0,0 +1,181 @@
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Callable, Dict, List, Tuple, Union
import numpy as np
from TTS.tts.datasets.dataset import *
from TTS.tts.datasets.formatters import *
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
Args:
items (List[List]):
A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
eval_split_max_size (int):
Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
eval_split_size (float):
If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
"""
speakers = [item["speaker_name"] for item in items]
is_multi_speaker = len(set(speakers)) > 1
if eval_split_size > 1:
eval_split_size = int(eval_split_size)
else:
if eval_split_max_size:
eval_split_size = min(eval_split_max_size, int(len(items) * eval_split_size))
else:
eval_split_size = int(len(items) * eval_split_size)
assert (
eval_split_size > 0
), " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format(
1 / len(items)
)
np.random.seed(0)
np.random.shuffle(items)
if is_multi_speaker:
items_eval = []
speakers = [item["speaker_name"] for item in items]
speaker_counter = Counter(speakers)
while len(items_eval) < eval_split_size:
item_idx = np.random.randint(0, len(items))
speaker_to_be_removed = items[item_idx]["speaker_name"]
if speaker_counter[speaker_to_be_removed] > 1:
items_eval.append(items[item_idx])
speaker_counter[speaker_to_be_removed] -= 1
del items[item_idx]
return items_eval, items
return items[:eval_split_size], items[eval_split_size:]
def add_extra_keys(metadata, language, dataset_name):
for item in metadata:
# add language name
item["language"] = language
# add unique audio name
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"{dataset_name}#{relfilepath}"
item["audio_unique_name"] = audio_unique_name
return metadata
def load_tts_samples(
datasets: Union[List[Dict], Dict],
eval_split=True,
formatter: Callable = None,
eval_split_max_size=None,
eval_split_size=0.01,
) -> Tuple[List[List], List[List]]:
"""Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided.
If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based
on the dataset name.
Args:
datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are
in the list, they are all merged.
eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate
an eval split automatically. Defaults to True.
formatter (Callable, optional): The preprocessing function to be applied to create the list of samples. It
must take the root_path and the meta_file name and return a list of samples in the format of
`[[text, audio_path, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as
example. Defaults to None.
eval_split_max_size (int):
Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
eval_split_size (float):
If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
Returns:
Tuple[List[List], List[List]: training and evaluation splits of the dataset.
"""
meta_data_train_all = []
meta_data_eval_all = [] if eval_split else None
if not isinstance(datasets, list):
datasets = [datasets]
for dataset in datasets:
formatter_name = dataset["formatter"]
dataset_name = dataset["dataset_name"]
root_path = dataset["path"]
meta_file_train = dataset["meta_file_train"]
meta_file_val = dataset["meta_file_val"]
ignored_speakers = dataset["ignored_speakers"]
language = dataset["language"]
# setup the right data processor
if formatter is None:
formatter = _get_formatter_by_name(formatter_name)
# load train set
meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
# load evaluation split if set
if eval_split:
if meta_file_val:
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
else:
eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for the duration predictor training
if dataset.meta_file_attn_mask:
meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
for idx, ins in enumerate(meta_data_train_all):
attn_file = meta_data[ins["audio_file"]].strip()
meta_data_train_all[idx].update({"alignment_file": attn_file})
if meta_data_eval_all:
for idx, ins in enumerate(meta_data_eval_all):
attn_file = meta_data[ins["audio_file"]].strip()
meta_data_eval_all[idx].update({"alignment_file": attn_file})
# set none for the next iter
formatter = None
return meta_data_train_all, meta_data_eval_all
def load_attention_mask_meta_data(metafile_path):
"""Load meta data file created by compute_attention_masks.py"""
with open(metafile_path, "r", encoding="utf-8") as f:
lines = f.readlines()
meta_data = []
for line in lines:
wav_file, attn_file = line.split("|")
meta_data.append([wav_file, attn_file])
return meta_data
def _get_formatter_by_name(name):
"""Returns the respective preprocessing function."""
thismodule = sys.modules[__name__]
return getattr(thismodule, name.lower())
def find_unique_chars(data_samples, verbose=True):
texts = "".join(item[0] for item in data_samples)
chars = set(texts)
lower_chars = filter(lambda c: c.islower(), chars)
chars_force_lower = [c.lower() for c in chars]
chars_force_lower = set(chars_force_lower)
if verbose:
print(f" > Number of unique characters: {len(chars)}")
print(f" > Unique characters: {''.join(sorted(chars))}")
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
return chars_force_lower
+973
View File
@@ -0,0 +1,973 @@
import base64
import collections
import os
import random
from typing import Dict, List, Union
import numpy as np
import torch
import tqdm
from torch.utils.data import Dataset
from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
import mutagen
# to prevent too many open files error as suggested here
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
torch.multiprocessing.set_sharing_strategy("file_system")
def _parse_sample(item):
language_name = None
attn_file = None
if len(item) == 5:
text, wav_file, speaker_name, language_name, attn_file = item
elif len(item) == 4:
text, wav_file, speaker_name, language_name = item
elif len(item) == 3:
text, wav_file, speaker_name = item
else:
raise ValueError(" [!] Dataset cannot parse the sample.")
return text, wav_file, speaker_name, language_name, attn_file
def noise_augment_audio(wav):
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
def string2filename(string):
# generate a safe and reversible filename based on a string
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
return filename
def get_audio_size(audiopath):
extension = audiopath.rpartition(".")[-1].lower()
if extension not in {"mp3", "wav", "flac"}:
raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
audio_info = mutagen.File(audiopath).info
return int(audio_info.length * audio_info.sample_rate)
class TTSDataset(Dataset):
def __init__(
self,
outputs_per_step: int = 1,
compute_linear_spec: bool = False,
ap: AudioProcessor = None,
samples: List[Dict] = None,
tokenizer: "TTSTokenizer" = None,
compute_f0: bool = False,
compute_energy: bool = False,
f0_cache_path: str = None,
energy_cache_path: str = None,
return_wav: bool = False,
batch_group_size: int = 0,
min_text_len: int = 0,
max_text_len: int = float("inf"),
min_audio_len: int = 0,
max_audio_len: int = float("inf"),
phoneme_cache_path: str = None,
precompute_num_workers: int = 0,
speaker_id_mapping: Dict = None,
d_vector_mapping: Dict = None,
language_id_mapping: Dict = None,
use_noise_augment: bool = False,
start_by_longest: bool = False,
verbose: bool = False,
):
"""Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs.
If you need something different, you can subclass and override.
Args:
outputs_per_step (int): Number of time frames predicted per step.
compute_linear_spec (bool): compute linear spectrogram if True.
ap (TTS.tts.utils.AudioProcessor): Audio processor object.
samples (list): List of dataset samples.
tokenizer (TTSTokenizer): tokenizer to convert text to sequence IDs. If None init internally else
use the given. Defaults to None.
compute_f0 (bool): compute f0 if True. Defaults to False.
compute_energy (bool): compute energy if True. Defaults to False.
f0_cache_path (str): Path to store f0 cache. Defaults to None.
energy_cache_path (str): Path to store energy cache. Defaults to None.
return_wav (bool): Return the waveform of the sample. Defaults to False.
batch_group_size (int): Range of batch randomization after sorting
sequences by length. It shuffles each batch with bucketing to gather similar lenght sequences in a
batch. Set 0 to disable. Defaults to 0.
min_text_len (int): Minimum length of input text to be used. All shorter samples will be ignored.
Defaults to 0.
max_text_len (int): Maximum length of input text to be used. All longer samples will be ignored.
Defaults to float("inf").
min_audio_len (int): Minimum length of input audio to be used. All shorter samples will be ignored.
Defaults to 0.
max_audio_len (int): Maximum length of input audio to be used. All longer samples will be ignored.
The maximum length in the dataset defines the VRAM used in the training. Hence, pay attention to
this value if you encounter an OOM error in training. Defaults to float("inf").
phoneme_cache_path (str): Path to cache computed phonemes. It writes phonemes of each sample to a
separate file. Defaults to None.
precompute_num_workers (int): Number of workers to precompute features. Defaults to 0.
speaker_id_mapping (dict): Mapping of speaker names to IDs used to compute embedding vectors by the
embedding layer. Defaults to None.
d_vector_mapping (dict): Mapping of wav files to computed d-vectors. Defaults to None.
use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False.
start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False.
verbose (bool): Print diagnostic information. Defaults to false.
"""
super().__init__()
self.batch_group_size = batch_group_size
self._samples = samples
self.outputs_per_step = outputs_per_step
self.compute_linear_spec = compute_linear_spec
self.return_wav = return_wav
self.compute_f0 = compute_f0
self.compute_energy = compute_energy
self.f0_cache_path = f0_cache_path
self.energy_cache_path = energy_cache_path
self.min_audio_len = min_audio_len
self.max_audio_len = max_audio_len
self.min_text_len = min_text_len
self.max_text_len = max_text_len
self.ap = ap
self.phoneme_cache_path = phoneme_cache_path
self.speaker_id_mapping = speaker_id_mapping
self.d_vector_mapping = d_vector_mapping
self.language_id_mapping = language_id_mapping
self.use_noise_augment = use_noise_augment
self.start_by_longest = start_by_longest
self.verbose = verbose
self.rescue_item_idx = 1
self.pitch_computed = False
self.tokenizer = tokenizer
if self.tokenizer.use_phonemes:
self.phoneme_dataset = PhonemeDataset(
self.samples, self.tokenizer, phoneme_cache_path, precompute_num_workers=precompute_num_workers
)
if compute_f0:
self.f0_dataset = F0Dataset(
self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers
)
if compute_energy:
self.energy_dataset = EnergyDataset(
self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
)
if self.verbose:
self.print_logs()
@property
def lengths(self):
lens = []
for item in self.samples:
_, wav_file, *_ = _parse_sample(item)
audio_len = get_audio_size(wav_file)
lens.append(audio_len)
return lens
@property
def samples(self):
return self._samples
@samples.setter
def samples(self, new_samples):
self._samples = new_samples
if hasattr(self, "f0_dataset"):
self.f0_dataset.samples = new_samples
if hasattr(self, "energy_dataset"):
self.energy_dataset.samples = new_samples
if hasattr(self, "phoneme_dataset"):
self.phoneme_dataset.samples = new_samples
def __len__(self):
return len(self.samples)
def __getitem__(self, idx):
return self.load_data(idx)
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> DataLoader initialization")
print(f"{indent}| > Tokenizer:")
self.tokenizer.print_logs(level + 1)
print(f"{indent}| > Number of instances : {len(self.samples)}")
def load_wav(self, filename):
waveform = self.ap.load_wav(filename)
assert waveform.size > 0
return waveform
def get_phonemes(self, idx, text):
out_dict = self.phoneme_dataset[idx]
assert text == out_dict["text"], f"{text} != {out_dict['text']}"
assert len(out_dict["token_ids"]) > 0
return out_dict
def get_f0(self, idx):
out_dict = self.f0_dataset[idx]
item = self.samples[idx]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
def get_energy(self, idx):
out_dict = self.energy_dataset[idx]
item = self.samples[idx]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
@staticmethod
def get_attn_mask(attn_file):
return np.load(attn_file)
def get_token_ids(self, idx, text):
if self.tokenizer.use_phonemes:
token_ids = self.get_phonemes(idx, text)["token_ids"]
else:
token_ids = self.tokenizer.text_to_ids(text)
return np.array(token_ids, dtype=np.int32)
def load_data(self, idx):
item = self.samples[idx]
raw_text = item["text"]
wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32)
# apply noise for augmentation
if self.use_noise_augment:
wav = noise_augment_audio(wav)
# get token ids
token_ids = self.get_token_ids(idx, item["text"])
# get pre-computed attention maps
attn = None
if "alignment_file" in item:
attn = self.get_attn_mask(item["alignment_file"])
# after phonemization the text length may change
# this is a shareful 🤭 hack to prevent longer phonemes
# TODO: find a better fix
if len(token_ids) > self.max_text_len or len(wav) < self.min_audio_len:
self.rescue_item_idx += 1
return self.load_data(self.rescue_item_idx)
# get f0 values
f0 = None
if self.compute_f0:
f0 = self.get_f0(idx)["f0"]
energy = None
if self.compute_energy:
energy = self.get_energy(idx)["energy"]
sample = {
"raw_text": raw_text,
"token_ids": token_ids,
"wav": wav,
"pitch": f0,
"energy": energy,
"attn": attn,
"item_idx": item["audio_file"],
"speaker_name": item["speaker_name"],
"language_name": item["language"],
"wav_file_name": os.path.basename(item["audio_file"]),
"audio_unique_name": item["audio_unique_name"],
}
return sample
@staticmethod
def _compute_lengths(samples):
new_samples = []
for item in samples:
audio_length = get_audio_size(item["audio_file"])
text_lenght = len(item["text"])
item["audio_length"] = audio_length
item["text_length"] = text_lenght
new_samples += [item]
return new_samples
@staticmethod
def filter_by_length(lengths: List[int], min_len: int, max_len: int):
idxs = np.argsort(lengths) # ascending order
ignore_idx = []
keep_idx = []
for idx in idxs:
length = lengths[idx]
if length < min_len or length > max_len:
ignore_idx.append(idx)
else:
keep_idx.append(idx)
return ignore_idx, keep_idx
@staticmethod
def sort_by_length(samples: List[List]):
audio_lengths = [s["audio_length"] for s in samples]
idxs = np.argsort(audio_lengths) # ascending order
return idxs
@staticmethod
def create_buckets(samples, batch_group_size: int):
assert batch_group_size > 0
for i in range(len(samples) // batch_group_size):
offset = i * batch_group_size
end_offset = offset + batch_group_size
temp_items = samples[offset:end_offset]
random.shuffle(temp_items)
samples[offset:end_offset] = temp_items
return samples
@staticmethod
def _select_samples_by_idx(idxs, samples):
samples_new = []
for idx in idxs:
samples_new.append(samples[idx])
return samples_new
def preprocess_samples(self):
r"""Sort `items` based on text length or audio length in ascending order. Filter out samples out or the length
range.
"""
samples = self._compute_lengths(self.samples)
# sort items based on the sequence length in ascending order
text_lengths = [i["text_length"] for i in samples]
audio_lengths = [i["audio_length"] for i in samples]
text_ignore_idx, text_keep_idx = self.filter_by_length(text_lengths, self.min_text_len, self.max_text_len)
audio_ignore_idx, audio_keep_idx = self.filter_by_length(audio_lengths, self.min_audio_len, self.max_audio_len)
keep_idx = list(set(audio_keep_idx) & set(text_keep_idx))
ignore_idx = list(set(audio_ignore_idx) | set(text_ignore_idx))
samples = self._select_samples_by_idx(keep_idx, samples)
sorted_idxs = self.sort_by_length(samples)
if self.start_by_longest:
longest_idxs = sorted_idxs[-1]
sorted_idxs[-1] = sorted_idxs[0]
sorted_idxs[0] = longest_idxs
samples = self._select_samples_by_idx(sorted_idxs, samples)
if len(samples) == 0:
raise RuntimeError(" [!] No samples left")
# shuffle batch groups
# create batches with similar length items
# the larger the `batch_group_size`, the higher the length variety in a batch.
if self.batch_group_size > 0:
samples = self.create_buckets(samples, self.batch_group_size)
# update items to the new sorted items
audio_lengths = [s["audio_length"] for s in samples]
text_lengths = [s["text_length"] for s in samples]
self.samples = samples
if self.verbose:
print(" | > Preprocessing samples")
print(" | > Max text length: {}".format(np.max(text_lengths)))
print(" | > Min text length: {}".format(np.min(text_lengths)))
print(" | > Avg text length: {}".format(np.mean(text_lengths)))
print(" | ")
print(" | > Max audio length: {}".format(np.max(audio_lengths)))
print(" | > Min audio length: {}".format(np.min(audio_lengths)))
print(" | > Avg audio length: {}".format(np.mean(audio_lengths)))
print(f" | > Num. instances discarded samples: {len(ignore_idx)}")
print(" | > Batch group size: {}.".format(self.batch_group_size))
@staticmethod
def _sort_batch(batch, text_lengths):
"""Sort the batch by the input text length for RNN efficiency.
Args:
batch (Dict): Batch returned by `__getitem__`.
text_lengths (List[int]): Lengths of the input character sequences.
"""
text_lengths, ids_sorted_decreasing = torch.sort(torch.LongTensor(text_lengths), dim=0, descending=True)
batch = [batch[idx] for idx in ids_sorted_decreasing]
return batch, text_lengths, ids_sorted_decreasing
def collate_fn(self, batch):
r"""
Perform preprocessing and create a final data batch:
1. Sort batch instances by text-length
2. Convert Audio signal to features.
3. PAD sequences wrt r.
4. Load to Torch.
"""
# Puts each data field into a tensor with outer dimension batch size
if isinstance(batch[0], collections.abc.Mapping):
token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])
# sort items with text input length for RNN efficiency
batch, token_ids_lengths, ids_sorted_decreasing = self._sort_batch(batch, token_ids_lengths)
# convert list of dicts to dict of lists
batch = {k: [dic[k] for dic in batch] for k in batch[0]}
# get language ids from language names
if self.language_id_mapping is not None:
language_ids = [self.language_id_mapping[ln] for ln in batch["language_name"]]
else:
language_ids = None
# get pre-computed d-vectors
if self.d_vector_mapping is not None:
embedding_keys = list(batch["audio_unique_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
else:
d_vectors = None
# get numerical speaker ids from speaker names
if self.speaker_id_mapping:
speaker_ids = [self.speaker_id_mapping[sn] for sn in batch["speaker_name"]]
else:
speaker_ids = None
# compute features
mel = [self.ap.melspectrogram(w).astype("float32") for w in batch["wav"]]
mel_lengths = [m.shape[1] for m in mel]
# lengths adjusted by the reduction factor
mel_lengths_adjusted = [
m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
if m.shape[1] % self.outputs_per_step
else m.shape[1]
for m in mel
]
# compute 'stop token' targets
stop_targets = [np.array([0.0] * (mel_len - 1) + [1.0]) for mel_len in mel_lengths]
# PAD stop targets
stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
# PAD sequences with longest instance in the batch
token_ids = prepare_data(batch["token_ids"]).astype(np.int32)
# PAD features with longest instance
mel = prepare_tensor(mel, self.outputs_per_step)
# B x D x T --> B x T x D
mel = mel.transpose(0, 2, 1)
# convert things to pytorch
token_ids_lengths = torch.LongTensor(token_ids_lengths)
token_ids = torch.LongTensor(token_ids)
mel = torch.FloatTensor(mel).contiguous()
mel_lengths = torch.LongTensor(mel_lengths)
stop_targets = torch.FloatTensor(stop_targets)
# speaker vectors
if d_vectors is not None:
d_vectors = torch.FloatTensor(d_vectors)
if speaker_ids is not None:
speaker_ids = torch.LongTensor(speaker_ids)
if language_ids is not None:
language_ids = torch.LongTensor(language_ids)
# compute linear spectrogram
linear = None
if self.compute_linear_spec:
linear = [self.ap.spectrogram(w).astype("float32") for w in batch["wav"]]
linear = prepare_tensor(linear, self.outputs_per_step)
linear = linear.transpose(0, 2, 1)
assert mel.shape[1] == linear.shape[1]
linear = torch.FloatTensor(linear).contiguous()
# format waveforms
wav_padded = None
if self.return_wav:
wav_lengths = [w.shape[0] for w in batch["wav"]]
max_wav_len = max(mel_lengths_adjusted) * self.ap.hop_length
wav_lengths = torch.LongTensor(wav_lengths)
wav_padded = torch.zeros(len(batch["wav"]), 1, max_wav_len)
for i, w in enumerate(batch["wav"]):
mel_length = mel_lengths_adjusted[i]
w = np.pad(w, (0, self.ap.hop_length * self.outputs_per_step), mode="edge")
w = w[: mel_length * self.ap.hop_length]
wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
wav_padded.transpose_(1, 2)
# format F0
if self.compute_f0:
pitch = prepare_data(batch["pitch"])
assert mel.shape[1] == pitch.shape[1], f"[!] {mel.shape} vs {pitch.shape}"
pitch = torch.FloatTensor(pitch)[:, None, :].contiguous() # B x 1 xT
else:
pitch = None
# format energy
if self.compute_energy:
energy = prepare_data(batch["energy"])
assert mel.shape[1] == energy.shape[1], f"[!] {mel.shape} vs {energy.shape}"
energy = torch.FloatTensor(energy)[:, None, :].contiguous() # B x 1 xT
else:
energy = None
# format attention masks
attns = None
if batch["attn"][0] is not None:
attns = [batch["attn"][idx].T for idx in ids_sorted_decreasing]
for idx, attn in enumerate(attns):
pad2 = mel.shape[1] - attn.shape[1]
pad1 = token_ids.shape[1] - attn.shape[0]
assert pad1 >= 0 and pad2 >= 0, f"[!] Negative padding - {pad1} and {pad2}"
attn = np.pad(attn, [[0, pad1], [0, pad2]])
attns[idx] = attn
attns = prepare_tensor(attns, self.outputs_per_step)
attns = torch.FloatTensor(attns).unsqueeze(1)
return {
"token_id": token_ids,
"token_id_lengths": token_ids_lengths,
"speaker_names": batch["speaker_name"],
"linear": linear,
"mel": mel,
"mel_lengths": mel_lengths,
"stop_targets": stop_targets,
"item_idxs": batch["item_idx"],
"d_vectors": d_vectors,
"speaker_ids": speaker_ids,
"attns": attns,
"waveform": wav_padded,
"raw_text": batch["raw_text"],
"pitch": pitch,
"energy": energy,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_name"],
}
raise TypeError(
(
"batch must contain tensors, numbers, dicts or lists;\
found {}".format(
type(batch[0])
)
)
)
class PhonemeDataset(Dataset):
"""Phoneme Dataset for converting input text to phonemes and then token IDs
At initialization, it pre-computes the phonemes under `cache_path` and loads them in training to reduce data
loading latency. If `cache_path` is already present, it skips the pre-computation.
Args:
samples (Union[List[List], List[Dict]]):
List of samples. Each sample is a list or a dict.
tokenizer (TTSTokenizer):
Tokenizer to convert input text to phonemes.
cache_path (str):
Path to cache phonemes. If `cache_path` is already present or None, it skips the pre-computation.
precompute_num_workers (int):
Number of workers used for pre-computing the phonemes. Defaults to 0.
"""
def __init__(
self,
samples: Union[List[Dict], List[List]],
tokenizer: "TTSTokenizer",
cache_path: str,
precompute_num_workers=0,
):
self.samples = samples
self.tokenizer = tokenizer
self.cache_path = cache_path
if cache_path is not None and not os.path.exists(cache_path):
os.makedirs(cache_path)
self.precompute(precompute_num_workers)
def __getitem__(self, index):
item = self.samples[index]
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"])
ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
def __len__(self):
return len(self.samples)
def compute_or_load(self, file_name, text, language):
"""Compute phonemes for the given text.
If the phonemes are already cached, load them from cache.
"""
file_ext = "_phoneme.npy"
cache_path = os.path.join(self.cache_path, file_name + file_ext)
try:
ids = np.load(cache_path)
except FileNotFoundError:
ids = self.tokenizer.text_to_ids(text, language=language)
np.save(cache_path, ids)
return ids
def get_pad_id(self):
"""Get pad token ID for sequence padding"""
return self.tokenizer.pad_id
def precompute(self, num_workers=1):
"""Precompute phonemes for all samples.
We use pytorch dataloader because we are lazy.
"""
print("[*] Pre-computing phonemes...")
with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1
dataloder = torch.utils.data.DataLoader(
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
)
for _ in dataloder:
pbar.update(batch_size)
def collate_fn(self, batch):
ids = [item["token_ids"] for item in batch]
ids_lens = [item["token_ids_len"] for item in batch]
texts = [item["text"] for item in batch]
texts_hat = [item["ph_hat"] for item in batch]
ids_lens_max = max(ids_lens)
ids_torch = torch.LongTensor(len(ids), ids_lens_max).fill_(self.get_pad_id())
for i, ids_len in enumerate(ids_lens):
ids_torch[i, :ids_len] = torch.LongTensor(ids[i])
return {"text": texts, "ph_hat": texts_hat, "token_ids": ids_torch}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> PhonemeDataset ")
print(f"{indent}| > Tokenizer:")
self.tokenizer.print_logs(level + 1)
print(f"{indent}| > Number of instances : {len(self.samples)}")
class F0Dataset:
"""F0 Dataset for computing F0 from wav files in CPU
Pre-compute F0 values for all the samples at initialization if `cache_path` is not None or already present. It
also computes the mean and std of F0 values if `normalize_f0` is True.
Args:
samples (Union[List[List], List[Dict]]):
List of samples. Each sample is a list or a dict.
ap (AudioProcessor):
AudioProcessor to compute F0 from wav files.
cache_path (str):
Path to cache F0 values. If `cache_path` is already present or None, it skips the pre-computation.
Defaults to None.
precompute_num_workers (int):
Number of workers used for pre-computing the F0 values. Defaults to 0.
normalize_f0 (bool):
Whether to normalize F0 values by mean and std. Defaults to True.
"""
def __init__(
self,
samples: Union[List[List], List[Dict]],
ap: "AudioProcessor",
audio_config=None, # pylint: disable=unused-argument
verbose=False,
cache_path: str = None,
precompute_num_workers=0,
normalize_f0=True,
):
self.samples = samples
self.ap = ap
self.verbose = verbose
self.cache_path = cache_path
self.normalize_f0 = normalize_f0
self.pad_id = 0.0
self.mean = None
self.std = None
if cache_path is not None and not os.path.exists(cache_path):
os.makedirs(cache_path)
self.precompute(precompute_num_workers)
if normalize_f0:
self.load_stats(cache_path)
def __getitem__(self, idx):
item = self.samples[idx]
f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_f0:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
f0 = self.normalize(f0)
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
def __len__(self):
return len(self.samples)
def precompute(self, num_workers=0):
print("[*] Pre-computing F0s...")
with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1
# we do not normalize at preproessing
normalize_f0 = self.normalize_f0
self.normalize_f0 = False
dataloder = torch.utils.data.DataLoader(
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
)
computed_data = []
for batch in dataloder:
f0 = batch["f0"]
computed_data.append(f for f in f0)
pbar.update(batch_size)
self.normalize_f0 = normalize_f0
if self.normalize_f0:
computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
pitch_mean, pitch_std = self.compute_pitch_stats(computed_data)
pitch_stats = {"mean": pitch_mean, "std": pitch_std}
np.save(os.path.join(self.cache_path, "pitch_stats"), pitch_stats, allow_pickle=True)
def get_pad_id(self):
return self.pad_id
@staticmethod
def create_pitch_file_path(file_name, cache_path):
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
return pitch_file
@staticmethod
def _compute_and_save_pitch(ap, wav_file, pitch_file=None):
wav = ap.load_wav(wav_file)
pitch = ap.compute_f0(wav)
if pitch_file:
np.save(pitch_file, pitch)
return pitch
@staticmethod
def compute_pitch_stats(pitch_vecs):
nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in pitch_vecs])
mean, std = np.mean(nonzeros), np.std(nonzeros)
return mean, std
def load_stats(self, cache_path):
stats_path = os.path.join(cache_path, "pitch_stats.npy")
stats = np.load(stats_path, allow_pickle=True).item()
self.mean = stats["mean"].astype(np.float32)
self.std = stats["std"].astype(np.float32)
def normalize(self, pitch):
zero_idxs = np.where(pitch == 0.0)[0]
pitch = pitch - self.mean
pitch = pitch / self.std
pitch[zero_idxs] = 0.0
return pitch
def denormalize(self, pitch):
zero_idxs = np.where(pitch == 0.0)[0]
pitch *= self.std
pitch += self.mean
pitch[zero_idxs] = 0.0
return pitch
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute pitch and return a numpy array of pitch values
"""
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(pitch_file):
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
else:
pitch = np.load(pitch_file)
return pitch.astype(np.float32)
def collate_fn(self, batch):
audio_unique_name = [item["audio_unique_name"] for item in batch]
f0s = [item["f0"] for item in batch]
f0_lens = [len(item["f0"]) for item in batch]
f0_lens_max = max(f0_lens)
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
for i, f0_len in enumerate(f0_lens):
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> F0Dataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")
class EnergyDataset:
"""Energy Dataset for computing Energy from wav files in CPU
Pre-compute Energy values for all the samples at initialization if `cache_path` is not None or already present. It
also computes the mean and std of Energy values if `normalize_Energy` is True.
Args:
samples (Union[List[List], List[Dict]]):
List of samples. Each sample is a list or a dict.
ap (AudioProcessor):
AudioProcessor to compute Energy from wav files.
cache_path (str):
Path to cache Energy values. If `cache_path` is already present or None, it skips the pre-computation.
Defaults to None.
precompute_num_workers (int):
Number of workers used for pre-computing the Energy values. Defaults to 0.
normalize_Energy (bool):
Whether to normalize Energy values by mean and std. Defaults to True.
"""
def __init__(
self,
samples: Union[List[List], List[Dict]],
ap: "AudioProcessor",
verbose=False,
cache_path: str = None,
precompute_num_workers=0,
normalize_energy=True,
):
self.samples = samples
self.ap = ap
self.verbose = verbose
self.cache_path = cache_path
self.normalize_energy = normalize_energy
self.pad_id = 0.0
self.mean = None
self.std = None
if cache_path is not None and not os.path.exists(cache_path):
os.makedirs(cache_path)
self.precompute(precompute_num_workers)
if normalize_energy:
self.load_stats(cache_path)
def __getitem__(self, idx):
item = self.samples[idx]
energy = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_energy:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
energy = self.normalize(energy)
return {"audio_unique_name": item["audio_unique_name"], "energy": energy}
def __len__(self):
return len(self.samples)
def precompute(self, num_workers=0):
print("[*] Pre-computing energys...")
with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1
# we do not normalize at preproessing
normalize_energy = self.normalize_energy
self.normalize_energy = False
dataloder = torch.utils.data.DataLoader(
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
)
computed_data = []
for batch in dataloder:
energy = batch["energy"]
computed_data.append(e for e in energy)
pbar.update(batch_size)
self.normalize_energy = normalize_energy
if self.normalize_energy:
computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
energy_mean, energy_std = self.compute_energy_stats(computed_data)
energy_stats = {"mean": energy_mean, "std": energy_std}
np.save(os.path.join(self.cache_path, "energy_stats"), energy_stats, allow_pickle=True)
def get_pad_id(self):
return self.pad_id
@staticmethod
def create_energy_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
energy_file = os.path.join(cache_path, file_name + "_energy.npy")
return energy_file
@staticmethod
def _compute_and_save_energy(ap, wav_file, energy_file=None):
wav = ap.load_wav(wav_file)
energy = calculate_energy(wav, fft_size=ap.fft_size, hop_length=ap.hop_length, win_length=ap.win_length)
if energy_file:
np.save(energy_file, energy)
return energy
@staticmethod
def compute_energy_stats(energy_vecs):
nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in energy_vecs])
mean, std = np.mean(nonzeros), np.std(nonzeros)
return mean, std
def load_stats(self, cache_path):
stats_path = os.path.join(cache_path, "energy_stats.npy")
stats = np.load(stats_path, allow_pickle=True).item()
self.mean = stats["mean"].astype(np.float32)
self.std = stats["std"].astype(np.float32)
def normalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy = energy - self.mean
energy = energy / self.std
energy[zero_idxs] = 0.0
return energy
def denormalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy *= self.std
energy += self.mean
energy[zero_idxs] = 0.0
return energy
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute energy and return a numpy array of energy values
"""
energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(energy_file):
energy = self._compute_and_save_energy(self.ap, wav_file, energy_file)
else:
energy = np.load(energy_file)
return energy.astype(np.float32)
def collate_fn(self, batch):
audio_unique_name = [item["audio_unique_name"] for item in batch]
energys = [item["energy"] for item in batch]
energy_lens = [len(item["energy"]) for item in batch]
energy_lens_max = max(energy_lens)
energys_torch = torch.LongTensor(len(energys), energy_lens_max).fill_(self.get_pad_id())
for i, energy_len in enumerate(energy_lens):
energys_torch[i, :energy_len] = torch.LongTensor(energys[i])
return {"audio_unique_name": audio_unique_name, "energy": energys_torch, "energy_lens": energy_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> energyDataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")
+655
View File
@@ -0,0 +1,655 @@
import os
import re
import xml.etree.ElementTree as ET
from glob import glob
from pathlib import Path
from typing import List
import pandas as pd
from tqdm import tqdm
########################
# DATASETS
########################
def cml_tts(root_path, meta_file, ignored_speakers=None):
"""Normalizes the CML-TTS meta data file to TTS format
https://github.com/freds0/CML-TTS-Dataset/"""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
client_id = None if "client_id" in metadata.columns else "default"
emotion_name = None if "emotion_name" in metadata.columns else "neutral"
items = []
not_found_counter = 0
for row in metadata.itertuples():
if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
continue
audio_path = os.path.join(root_path, row.wav_filename)
if not os.path.exists(audio_path):
not_found_counter += 1
continue
items.append(
{
"text": row.transcript,
"audio_file": audio_path,
"speaker_name": client_id if client_id is not None else row.client_id,
"emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
"root_path": root_path,
}
)
if not_found_counter > 0:
print(f" | > [!] {not_found_counter} files not found")
return items
def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["audio_file", "text"])
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
emotion_name = None if "emotion_name" in metadata.columns else "neutral"
items = []
not_found_counter = 0
for row in metadata.itertuples():
if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
continue
audio_path = os.path.join(root_path, row.audio_file)
if not os.path.exists(audio_path):
not_found_counter += 1
continue
items.append(
{
"text": row.text,
"audio_file": audio_path,
"speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
"emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
"root_path": root_path,
}
)
if not_found_counter > 0:
print(f" | > [!] {not_found_counter} files not found")
return items
def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalize TWEB dataset.
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "tweb"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("\t")
wav_file = os.path.join(root_path, cols[0] + ".wav")
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes Mozilla meta data files to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "mozilla"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = cols[1].strip()
text = cols[0].strip()
wav_file = os.path.join(root_path, "wavs", wav_file)
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes Mozilla meta data files to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "mozilla"
with open(txt_file, "r", encoding="ISO 8859-1") as ttf:
for line in ttf:
cols = line.strip().split("|")
wav_file = cols[0].strip()
text = cols[1].strip()
folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL"
wav_file = os.path.join(root_path, folder_name, wav_file)
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def mailabs(root_path, meta_files=None, ignored_speakers=None):
"""Normalizes M-AI-Labs meta data files to TTS format
Args:
root_path (str): root folder of the MAILAB language folder.
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
recursively. Defaults to None
"""
speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
if not meta_files:
csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
else:
csv_files = meta_files
# meta_files = [f.strip() for f in meta_files.split(",")]
items = []
for csv_file in csv_files:
if os.path.isfile(csv_file):
txt_file = csv_file
else:
txt_file = os.path.join(root_path, csv_file)
folder = os.path.dirname(txt_file)
# determine speaker based on folder structure...
speaker_name_match = speaker_regex.search(txt_file)
if speaker_name_match is None:
continue
speaker_name = speaker_name_match.group("speaker_name")
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_name in ignored_speakers:
continue
print(" | > {}".format(csv_file))
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
if not meta_files:
wav_file = os.path.join(folder, "wavs", cols[0] + ".wav")
else:
wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), "wavs", cols[0] + ".wav")
if os.path.isfile(wav_file):
text = cols[1].strip()
items.append(
{"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}
)
else:
# M-AI-Labs have some missing samples, so just print the warning
print("> File %s does not exist!" % (wav_file))
return items
def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the LJSpeech meta data file to TTS format
https://keithito.com/LJ-Speech-Dataset/"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "ljspeech"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
text = cols[2]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the LJSpeech meta data file for TTS testing
https://keithito.com/LJ-Speech-Dataset/"""
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, "r", encoding="utf-8") as ttf:
speaker_id = 0
for idx, line in enumerate(ttf):
# 2 samples per speaker to avoid eval split issues
if idx % 2 == 0:
speaker_id += 1
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
text = cols[2]
items.append(
{"text": text, "audio_file": wav_file, "speaker_name": f"ljspeech-{speaker_id}", "root_path": root_path}
)
return items
def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the thorsten meta data file to TTS format
https://github.com/thorstenMueller/deep-learning-german-tts/"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "thorsten"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the sam-accenture meta data file to TTS format
https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
xml_file = os.path.join(root_path, "voice_over_recordings", meta_file)
xml_root = ET.parse(xml_file).getroot()
items = []
speaker_name = "sam_accenture"
for item in xml_root.findall("./fileid"):
text = item.text
wav_file = os.path.join(root_path, "vo_voice_quality_transformation", item.get("id") + ".wav")
if not os.path.exists(wav_file):
print(f" [!] {wav_file} in metafile does not exist. Skipping...")
continue
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the RUSLAN meta data file to TTS format
https://ruslan-corpus.github.io/"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "ruslan"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav")
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the CSS10 dataset file to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "css10"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the Nancy meta data file to TTS format"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "nancy"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
utt_id = line.split()[1]
text = line[line.find('"') + 1 : line.rfind('"') - 1]
wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def common_voice(root_path, meta_file, ignored_speakers=None):
"""Normalize the common voice meta data file to TTS format."""
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
if line.startswith("client_id"):
continue
cols = line.split("\t")
text = cols[2]
speaker_name = cols[0]
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_name in ignored_speakers:
continue
wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav"))
items.append(
{"text": text, "audio_file": wav_file, "speaker_name": "MCV_" + speaker_name, "root_path": root_path}
)
return items
def libri_tts(root_path, meta_files=None, ignored_speakers=None):
"""https://ai.google/tools/datasets/libri-tts/"""
items = []
if not meta_files:
meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
else:
if isinstance(meta_files, str):
meta_files = [os.path.join(root_path, meta_files)]
for meta_file in meta_files:
_meta_file = os.path.basename(meta_file).split(".")[0]
with open(meta_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("\t")
file_name = cols[0]
speaker_name, chapter_id, *_ = cols[0].split("_")
_root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
wav_file = os.path.join(_root_path, file_name + ".wav")
text = cols[2]
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_name in ignored_speakers:
continue
items.append(
{
"text": text,
"audio_file": wav_file,
"speaker_name": f"LTTS_{speaker_name}",
"root_path": root_path,
}
)
for item in items:
assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}"
return items
def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "turkish-female"
skipped_files = []
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav")
if not os.path.exists(wav_file):
skipped_files.append(wav_file)
continue
text = cols[1].strip()
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
return items
# ToDo: add the dataset link when the dataset is released publicly
def brspeech(root_path, meta_file, ignored_speakers=None):
"""BRSpeech 3.0 beta"""
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
if line.startswith("wav_filename"):
continue
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2]
speaker_id = cols[3]
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_id, "root_path": root_path})
return items
def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic1", ignored_speakers=None):
"""VCTK dataset v0.92.
URL:
https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
This dataset has 2 recordings per speaker that are annotated with ```mic1``` and ```mic2```.
It is believed that (😄 ) ```mic1``` files are the same as the previous version of the dataset.
mic1:
Audio recorded using an omni-directional microphone (DPA 4035).
Contains very low frequency noises.
This is the same audio released in previous versions of VCTK:
https://doi.org/10.7488/ds/1994
mic2:
Audio recorded using a small diaphragm condenser microphone with
very wide bandwidth (Sennheiser MKH 800).
Two speakers, p280 and p315 had technical issues of the audio
recordings using MKH 800.
"""
file_ext = "flac"
items = []
meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
for meta_file in meta_files:
_, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
file_id = txt_file.split(".")[0]
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
with open(meta_file, "r", encoding="utf-8") as file_text:
text = file_text.readlines()[0]
# p280 has no mic2 recordings
if speaker_id == "p280":
wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_mic1.{file_ext}")
else:
wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}")
if os.path.exists(wav_file):
items.append(
{"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path}
)
else:
print(f" [!] wav files don't exist - {wav_file}")
return items
def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None):
"""homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
items = []
meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
for meta_file in meta_files:
_, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
file_id = txt_file.split(".")[0]
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
with open(meta_file, "r", encoding="utf-8") as file_text:
text = file_text.readlines()[0]
wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav")
items.append(
{"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id, "root_path": root_path}
)
return items
def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-argument
items = []
speaker_name = "synpaflex"
root_path = os.path.join(root_path, "")
wav_files = glob(f"{root_path}**/*.wav", recursive=True)
for wav_file in wav_files:
if os.sep + "wav" + os.sep in wav_file:
txt_file = wav_file.replace("wav", "txt")
else:
txt_file = os.path.join(
os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt")
)
if os.path.exists(txt_file) and os.path.exists(wav_file):
with open(txt_file, "r", encoding="utf-8") as file_text:
text = file_text.readlines()[0]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, ignored_speakers=None):
"""ToDo: Refer the paper when available"""
items = []
split_dir = meta_files
meta_files = glob(f"{os.path.join(root_path, split_dir)}/**/*.txt", recursive=True)
for meta_file in meta_files:
_, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
file_id = txt_file.split(".")[0]
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
with open(meta_file, "r", encoding="utf-8") as file_text:
text = file_text.readline().replace("\n", "")
# ignore sentences that contains digits
if ignore_digits_sentences and any(map(str.isdigit, text)):
continue
wav_file = os.path.join(root_path, split_dir, speaker_id, file_id + ".flac")
items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id, "root_path": root_path})
return items
def mls(root_path, meta_files=None, ignored_speakers=None):
"""http://www.openslr.org/94/"""
items = []
with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta:
for line in meta:
file, text = line.split("\t")
text = text[:-1]
speaker, book, *_ = file.split("_")
wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav")
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker in ignored_speakers:
continue
items.append(
{"text": text, "audio_file": wav_file, "speaker_name": "MLS_" + speaker, "root_path": root_path}
)
return items
# ======================================== VOX CELEB ===========================================
def voxceleb2(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument
"""
:param meta_file Used only for consistency with load_tts_samples api
"""
return _voxcel_x(root_path, meta_file, voxcel_idx="2")
def voxceleb1(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument
"""
:param meta_file Used only for consistency with load_tts_samples api
"""
return _voxcel_x(root_path, meta_file, voxcel_idx="1")
def _voxcel_x(root_path, meta_file, voxcel_idx):
assert voxcel_idx in ["1", "2"]
expected_count = 148_000 if voxcel_idx == "1" else 1_000_000
voxceleb_path = Path(root_path)
cache_to = voxceleb_path / f"metafile_voxceleb{voxcel_idx}.csv"
cache_to.parent.mkdir(exist_ok=True)
# if not exists meta file, crawl recursively for 'wav' files
if meta_file is not None:
with open(str(meta_file), "r", encoding="utf-8") as f:
return [x.strip().split("|") for x in f.readlines()]
elif not cache_to.exists():
cnt = 0
meta_data = []
wav_files = voxceleb_path.rglob("**/*.wav")
for path in tqdm(
wav_files,
desc=f"Building VoxCeleb {voxcel_idx} Meta file ... this needs to be done only once.",
total=expected_count,
):
speaker_id = str(Path(path).parent.parent.stem)
assert speaker_id.startswith("id")
text = None # VoxCel does not provide transciptions, and they are not needed for training the SE
meta_data.append(f"{text}|{path}|voxcel{voxcel_idx}_{speaker_id}\n")
cnt += 1
with open(str(cache_to), "w", encoding="utf-8") as f:
f.write("".join(meta_data))
if cnt < expected_count:
raise ValueError(f"Found too few instances for Voxceleb. Should be around {expected_count}, is: {cnt}")
with open(str(cache_to), "r", encoding="utf-8") as f:
return [x.strip().split("|") for x in f.readlines()]
def emotion(root_path, meta_file, ignored_speakers=None):
"""Generic emotion dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
if line.startswith("file_path"):
continue
cols = line.split(",")
wav_file = os.path.join(root_path, cols[0])
speaker_id = cols[1]
emotion_id = cols[2].replace("\n", "")
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
items.append(
{"audio_file": wav_file, "speaker_name": speaker_id, "emotion_name": emotion_id, "root_path": root_path}
)
return items
def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument
"""Normalizes the Baker meta data file to TTS format
Args:
root_path (str): path to the baker dataset
meta_file (str): name of the meta dataset containing names of wav to select and the transcript of the sentence
Returns:
List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentences
"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "baker"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
wav_name, text = line.rstrip("\n").split("|")
wav_path = os.path.join(root_path, "clips_22", wav_name)
items.append({"text": text, "audio_file": wav_path, "speaker_name": speaker_name, "root_path": root_path})
return items
def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kokoro"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
text = cols[2].replace(" ", "")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kss"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2] # cols[1] => 6월, cols[2] => 유월
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "bel_tts"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
+14
View File
@@ -0,0 +1,14 @@
from typing import Dict, List, Union
from TTS.utils.generic_utils import find_module
def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS":
print(" > Using model: {}".format(config.model))
# fetch the right model implementation.
if "base_model" in config and config["base_model"] is not None:
MyModel = find_module("TTS.tts.models", config.base_model.lower())
else:
MyModel = find_module("TTS.tts.models", config.model.lower())
model = MyModel.init_from_config(config=config, samples=samples)
return model
+448
View File
@@ -0,0 +1,448 @@
from dataclasses import dataclass, field
from typing import Dict, List, Union
import torch
from coqpit import Coqpit
from torch import nn
from TTS.tts.layers.align_tts.mdn import MDNBlock
from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.io import load_fsspec
@dataclass
class AlignTTSArgs(Coqpit):
"""
Args:
num_chars (int):
number of unique input to characters
out_channels (int):
number of output tensor channels. It is equal to the expected spectrogram size.
hidden_channels (int):
number of channels in all the model layers.
hidden_channels_ffn (int):
number of channels in transformer's conv layers.
hidden_channels_dp (int):
number of channels in duration predictor network.
num_heads (int):
number of attention heads in transformer networks.
num_transformer_layers (int):
number of layers in encoder and decoder transformer blocks.
dropout_p (int):
dropout rate in transformer layers.
length_scale (int, optional):
coefficient to set the speech speed. <1 slower, >1 faster. Defaults to 1.
num_speakers (int, optional):
number of speakers for multi-speaker training. Defaults to 0.
external_c (bool, optional):
enable external speaker embeddings. Defaults to False.
c_in_channels (int, optional):
number of channels in speaker embedding vectors. Defaults to 0.
"""
num_chars: int = None
out_channels: int = 80
hidden_channels: int = 256
hidden_channels_dp: int = 256
encoder_type: str = "fftransformer"
encoder_params: dict = field(
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
)
decoder_type: str = "fftransformer"
decoder_params: dict = field(
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
)
length_scale: float = 1.0
num_speakers: int = 0
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_dim: int = 0
class AlignTTS(BaseTTS):
"""AlignTTS with modified duration predictor.
https://arxiv.org/pdf/2003.01950.pdf
Encoder -> DurationPredictor -> Decoder
Check :class:`AlignTTSArgs` for the class arguments.
Paper Abstract:
Targeting at both high efficiency and performance, we propose AlignTTS to predict the
mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a
sequence of characters, and the duration of each character is determined by a duration predictor.Instead of
adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented
to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s
how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean
option score (MOS), but also a high efficiency which is more than 50 times faster than real-time.
Note:
Original model uses a separate character embedding layer for duration predictor. However, it causes the
duration predictor to overfit and prevents learning higher level interactions among characters. Therefore,
we predict durations based on encoder outputs which has higher level information about input characters. This
enables training without phases as in the original paper.
Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture
differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters.
Examples:
>>> from TTS.tts.configs.align_tts_config import AlignTTSConfig
>>> config = AlignTTSConfig()
>>> model = AlignTTS(config)
"""
# pylint: disable=dangerous-default-value
def __init__(
self,
config: "AlignTTSConfig",
ap: "AudioProcessor" = None,
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
self.speaker_manager = speaker_manager
self.phase = -1
self.length_scale = (
float(config.model_args.length_scale)
if isinstance(config.model_args.length_scale, int)
else config.model_args.length_scale
)
self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels)
self.embedded_speaker_dim = 0
self.init_multispeaker(config)
self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels)
self.encoder = Encoder(
config.model_args.hidden_channels,
config.model_args.hidden_channels,
config.model_args.encoder_type,
config.model_args.encoder_params,
self.embedded_speaker_dim,
)
self.decoder = Decoder(
config.model_args.out_channels,
config.model_args.hidden_channels,
config.model_args.decoder_type,
config.model_args.decoder_params,
)
self.duration_predictor = DurationPredictor(config.model_args.hidden_channels_dp)
self.mod_layer = nn.Conv1d(config.model_args.hidden_channels, config.model_args.hidden_channels, 1)
self.mdn_block = MDNBlock(config.model_args.hidden_channels, 2 * config.model_args.out_channels)
if self.embedded_speaker_dim > 0 and self.embedded_speaker_dim != config.model_args.hidden_channels:
self.proj_g = nn.Conv1d(self.embedded_speaker_dim, config.model_args.hidden_channels, 1)
@staticmethod
def compute_log_probs(mu, log_sigma, y):
# pylint: disable=protected-access, c-extension-no-member
y = y.transpose(1, 2).unsqueeze(1) # [B, 1, T1, D]
mu = mu.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D]
log_sigma = log_sigma.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D]
expanded_y, expanded_mu = torch.broadcast_tensors(y, mu)
exponential = -0.5 * torch.mean(
torch._C._nn.mse_loss(expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), dim=-1
) # B, L, T
logp = exponential - 0.5 * log_sigma.mean(dim=-1)
return logp
def compute_align_path(self, mu, log_sigma, y, x_mask, y_mask):
# find the max alignment path
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
log_p = self.compute_log_probs(mu, log_sigma, y)
# [B, T_en, T_dec]
attn = maximum_path(log_p, attn_mask.squeeze(1)).unsqueeze(1)
dr_mas = torch.sum(attn, -1)
return dr_mas.squeeze(1), log_p
@staticmethod
def generate_attn(dr, x_mask, y_mask=None):
# compute decode mask from the durations
if y_mask is None:
y_lengths = dr.sum(1).long()
y_lengths[y_lengths < 1] = 1
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
return attn
def expand_encoder_outputs(self, en, dr, x_mask, y_mask):
"""Generate attention alignment map from durations and
expand encoder outputs
Examples::
- encoder output: [a,b,c,d]
- durations: [1, 3, 2, 1]
- expanded: [a, b, b, b, c, c, d]
- attention map: [[0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 1, 1, 0],
[0, 1, 1, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0]]
"""
attn = self.generate_attn(dr, x_mask, y_mask)
o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2)
return o_en_ex, attn
def format_durations(self, o_dr_log, x_mask):
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
o_dr[o_dr < 1] = 1.0
o_dr = torch.round(o_dr)
return o_dr
@staticmethod
def _concat_speaker_embedding(o_en, g):
g_exp = g.expand(-1, -1, o_en.size(-1)) # [B, C, T_en]
o_en = torch.cat([o_en, g_exp], 1)
return o_en
def _sum_speaker_embedding(self, x, g):
# project g to decoder dim.
if hasattr(self, "proj_g"):
g = self.proj_g(g)
return x + g
def _forward_encoder(self, x, x_lengths, g=None):
if hasattr(self, "emb_g"):
g = nn.functional.normalize(self.speaker_embedding(g)) # [B, C, 1]
if g is not None:
g = g.unsqueeze(-1)
# [B, T, C]
x_emb = self.emb(x)
# [B, C, T]
x_emb = torch.transpose(x_emb, 1, -1)
# compute sequence masks
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype)
# encoder pass
o_en = self.encoder(x_emb, x_mask)
# speaker conditioning for duration predictor
if g is not None:
o_en_dp = self._concat_speaker_embedding(o_en, g)
else:
o_en_dp = o_en
return o_en, o_en_dp, x_mask, g
def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g):
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype)
# expand o_en with durations
o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
# positional encoding
if hasattr(self, "pos_encoder"):
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
# speaker embedding
if g is not None:
o_en_ex = self._sum_speaker_embedding(o_en_ex, g)
# decoder pass
o_de = self.decoder(o_en_ex, y_mask, g=g)
return o_de, attn.transpose(1, 2)
def _forward_mdn(self, o_en, y, y_lengths, x_mask):
# MAS potentials and alignment
mu, log_sigma = self.mdn_block(o_en)
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype)
dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask)
return dr_mas, mu, log_sigma, logp
def forward(
self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None
): # pylint: disable=unused-argument
"""
Shapes:
- x: :math:`[B, T_max]`
- x_lengths: :math:`[B]`
- y_lengths: :math:`[B]`
- dr: :math:`[B, T_max]`
- g: :math:`[B, C]`
"""
y = y.transpose(1, 2)
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None
if phase == 0:
# train encoder and MDN
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask)
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype)
attn = self.generate_attn(dr_mas, x_mask, y_mask)
elif phase == 1:
# train decoder
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
dr_mas, _, _, _ = self._forward_mdn(o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en.detach(), o_en_dp.detach(), dr_mas.detach(), x_mask, y_lengths, g=g)
elif phase == 2:
# train the whole except duration predictor
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g)
elif phase == 3:
# train duration predictor
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
o_dr_log = self.duration_predictor(x, x_mask)
dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g)
o_dr_log = o_dr_log.squeeze(1)
else:
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask)
o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g)
o_dr_log = o_dr_log.squeeze(1)
dr_mas_log = torch.log(dr_mas + 1).squeeze(1)
outputs = {
"model_outputs": o_de.transpose(1, 2),
"alignments": attn,
"durations_log": o_dr_log,
"durations_mas_log": dr_mas_log,
"mu": mu,
"log_sigma": log_sigma,
"logp": logp,
}
return outputs
@torch.no_grad()
def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument
"""
Shapes:
- x: :math:`[B, T_max]`
- x_lengths: :math:`[B]`
- g: :math:`[B, C]`
"""
g = aux_input["d_vectors"] if "d_vectors" in aux_input else None
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
# pad input to prevent dropping the last word
# x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
# o_dr_log = self.duration_predictor(x, x_mask)
o_dr_log = self.duration_predictor(o_en_dp, x_mask)
# duration predictor pass
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
y_lengths = o_dr.sum(1)
o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g)
outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn}
return outputs
def train_step(self, batch: dict, criterion: nn.Module):
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input, self.phase)
loss_dict = criterion(
outputs["logp"],
outputs["model_outputs"],
mel_input,
mel_lengths,
outputs["durations_log"],
outputs["durations_mas_log"],
text_lengths,
phase=self.phase,
)
return outputs, loss_dict
def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use
model_outputs = outputs["model_outputs"]
alignments = outputs["alignments"]
mel_input = batch["mel_input"]
pred_spec = model_outputs[0].data.cpu().numpy()
gt_spec = mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False),
}
# Sample audio
train_audio = ap.inv_melspectrogram(pred_spec.T)
return figures, {"audio": train_audio}
def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
) -> None: # pylint: disable=no-self-use
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.train_figures(steps, figures)
logger.train_audios(steps, audios, self.ap.sample_rate)
def eval_step(self, batch: dict, criterion: nn.Module):
return self.train_step(batch, criterion)
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.eval_figures(steps, figures)
logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint(
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()
assert not self.training
def get_criterion(self):
from TTS.tts.layers.losses import AlignTTSLoss # pylint: disable=import-outside-toplevel
return AlignTTSLoss(self.config)
@staticmethod
def _set_phase(config, global_step):
"""Decide AlignTTS training phase"""
if isinstance(config.phase_start_steps, list):
vals = [i < global_step for i in config.phase_start_steps]
if not True in vals:
phase = 0
else:
phase = (
len(config.phase_start_steps)
- [i < global_step for i in config.phase_start_steps][::-1].index(True)
- 1
)
else:
phase = None
return phase
def on_epoch_start(self, trainer):
"""Set AlignTTS training phase on epoch start."""
self.phase = self._set_phase(trainer.config, trainer.total_steps_done)
@staticmethod
def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None):
"""Initiate model from config
Args:
config (AlignTTSConfig): Model config.
samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
Defaults to None.
"""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config)
tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config, samples)
return AlignTTS(new_config, ap, tokenizer, speaker_manager)
+284
View File
@@ -0,0 +1,284 @@
import os
from dataclasses import dataclass
from typing import Optional
import numpy as np
from coqpit import Coqpit
from encodec import EncodecModel
from transformers import BertTokenizer
from TTS.tts.layers.bark.inference_funcs import (
codec_decode,
generate_coarse,
generate_fine,
generate_text_semantic,
generate_voice,
load_voice,
)
from TTS.tts.layers.bark.load_model import load_model
from TTS.tts.layers.bark.model import GPT
from TTS.tts.layers.bark.model_fine import FineGPT
from TTS.tts.models.base_tts import BaseTTS
@dataclass
class BarkAudioConfig(Coqpit):
sample_rate: int = 24000
output_sample_rate: int = 24000
class Bark(BaseTTS):
def __init__(
self,
config: Coqpit,
tokenizer: BertTokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased"),
) -> None:
super().__init__(config=config, ap=None, tokenizer=None, speaker_manager=None, language_manager=None)
self.config.num_chars = len(tokenizer)
self.tokenizer = tokenizer
self.semantic_model = GPT(config.semantic_config)
self.coarse_model = GPT(config.coarse_config)
self.fine_model = FineGPT(config.fine_config)
self.encodec = EncodecModel.encodec_model_24khz()
self.encodec.set_target_bandwidth(6.0)
@property
def device(self):
return next(self.parameters()).device
def load_bark_models(self):
self.semantic_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text"
)
self.coarse_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["coarse"],
device=self.device,
config=self.config,
model_type="coarse",
)
self.fine_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["fine"], device=self.device, config=self.config, model_type="fine"
)
def train_step(
self,
):
pass
def text_to_semantic(
self,
text: str,
history_prompt: Optional[str] = None,
temp: float = 0.7,
base=None,
allow_early_stop=True,
**kwargs,
):
"""Generate semantic array from text.
Args:
text: text to be turned into audio
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
Returns:
numpy semantic array to be fed into `semantic_to_waveform`
"""
x_semantic = generate_text_semantic(
text,
self,
history_prompt=history_prompt,
temp=temp,
base=base,
allow_early_stop=allow_early_stop,
**kwargs,
)
return x_semantic
def semantic_to_waveform(
self,
semantic_tokens: np.ndarray,
history_prompt: Optional[str] = None,
temp: float = 0.7,
base=None,
):
"""Generate audio array from semantic input.
Args:
semantic_tokens: semantic token output from `text_to_semantic`
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
Returns:
numpy audio array at sample frequency 24khz
"""
x_coarse_gen = generate_coarse(
semantic_tokens,
self,
history_prompt=history_prompt,
temp=temp,
base=base,
)
x_fine_gen = generate_fine(
x_coarse_gen,
self,
history_prompt=history_prompt,
temp=0.5,
base=base,
)
audio_arr = codec_decode(x_fine_gen, self)
return audio_arr, x_coarse_gen, x_fine_gen
def generate_audio(
self,
text: str,
history_prompt: Optional[str] = None,
text_temp: float = 0.7,
waveform_temp: float = 0.7,
base=None,
allow_early_stop=True,
**kwargs,
):
"""Generate audio array from input text.
Args:
text: text to be turned into audio
history_prompt: history choice for audio cloning
text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
Returns:
numpy audio array at sample frequency 24khz
"""
x_semantic = self.text_to_semantic(
text,
history_prompt=history_prompt,
temp=text_temp,
base=base,
allow_early_stop=allow_early_stop,
**kwargs,
)
audio_arr, c, f = self.semantic_to_waveform(
x_semantic, history_prompt=history_prompt, temp=waveform_temp, base=base
)
return audio_arr, [x_semantic, c, f]
def generate_voice(self, audio, speaker_id, voice_dir):
"""Generate a voice from the given audio and text.
Args:
audio (str): Path to the audio file.
speaker_id (str): Speaker name.
voice_dir (str): Path to the directory to save the generate voice.
"""
if voice_dir is not None:
voice_dirs = [voice_dir]
try:
_ = load_voice(speaker_id, voice_dirs)
except (KeyError, FileNotFoundError):
output_path = os.path.join(voice_dir, speaker_id + ".npz")
os.makedirs(voice_dir, exist_ok=True)
generate_voice(audio, self, output_path)
def _set_voice_dirs(self, voice_dirs):
def_voice_dir = None
if isinstance(self.config.DEF_SPEAKER_DIR, str):
os.makedirs(self.config.DEF_SPEAKER_DIR, exist_ok=True)
if os.path.isdir(self.config.DEF_SPEAKER_DIR):
def_voice_dir = self.config.DEF_SPEAKER_DIR
_voice_dirs = [def_voice_dir] if def_voice_dir is not None else []
if voice_dirs is not None:
if isinstance(voice_dirs, str):
voice_dirs = [voice_dirs]
_voice_dirs = voice_dirs + _voice_dirs
return _voice_dirs
# TODO: remove config from synthesize
def synthesize(
self, text, config, speaker_id="random", voice_dirs=None, **kwargs
): # pylint: disable=unused-argument
"""Synthesize speech with the given input text.
Args:
text (str): Input text.
config (BarkConfig): Config with inference parameters.
speaker_id (str): One of the available speaker names. If `random`, it generates a random speaker.
speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in
`voice_dirs` with the name `speaker_id`. Defaults to None.
voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None.
**kwargs: Model specific inference settings used by `generate_audio()` and `TTS.tts.layers.bark.inference_funcs.generate_text_semantic().
Returns:
A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference,
`text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents`
as latents used at inference.
"""
speaker_id = "random" if speaker_id is None else speaker_id
voice_dirs = self._set_voice_dirs(voice_dirs)
history_prompt = load_voice(self, speaker_id, voice_dirs)
outputs = self.generate_audio(text, history_prompt=history_prompt, **kwargs)
return_dict = {
"wav": outputs[0],
"text_inputs": text,
}
return return_dict
def eval_step(self):
...
def forward(self):
...
def inference(self):
...
@staticmethod
def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument
return Bark(config)
# pylint: disable=unused-argument, redefined-builtin
def load_checkpoint(
self,
config,
checkpoint_dir,
text_model_path=None,
coarse_model_path=None,
fine_model_path=None,
hubert_model_path=None,
hubert_tokenizer_path=None,
eval=False,
strict=True,
**kwargs,
):
"""Load a model checkpoints from a directory. This model is with multiple checkpoint files and it
expects to have all the files to be under the given `checkpoint_dir` with the rigth names.
If eval is True, set the model to eval mode.
Args:
config (TortoiseConfig): The model config.
checkpoint_dir (str): The directory where the checkpoints are stored.
ar_checkpoint_path (str, optional): The path to the autoregressive checkpoint. Defaults to None.
diff_checkpoint_path (str, optional): The path to the diffusion checkpoint. Defaults to None.
clvp_checkpoint_path (str, optional): The path to the CLVP checkpoint. Defaults to None.
vocoder_checkpoint_path (str, optional): The path to the vocoder checkpoint. Defaults to None.
eval (bool, optional): Whether to set the model to eval mode. Defaults to False.
strict (bool, optional): Whether to load the model strictly. Defaults to True.
"""
text_model_path = text_model_path or os.path.join(checkpoint_dir, "text_2.pt")
coarse_model_path = coarse_model_path or os.path.join(checkpoint_dir, "coarse_2.pt")
fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt")
hubert_model_path = hubert_model_path or os.path.join(checkpoint_dir, "hubert.pt")
hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth")
self.config.LOCAL_MODEL_PATHS["text"] = text_model_path
self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path
self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path
self.config.LOCAL_MODEL_PATHS["hubert"] = hubert_model_path
self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path
self.load_bark_models()
if eval:
self.eval()
+305
View File
@@ -0,0 +1,305 @@
import copy
from abc import abstractmethod
from typing import Dict, Tuple
import torch
from coqpit import Coqpit
from torch import nn
from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.generic_utils import format_aux_input
from TTS.utils.io import load_fsspec
from TTS.utils.training import gradual_training_scheduler
class BaseTacotron(BaseTTS):
"""Base class shared by Tacotron and Tacotron2"""
def __init__(
self,
config: "TacotronConfig",
ap: "AudioProcessor",
tokenizer: "TTSTokenizer",
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
# pass all config fields as class attributes
for key in config:
setattr(self, key, config[key])
# layers
self.embedding = None
self.encoder = None
self.decoder = None
self.postnet = None
# init tensors
self.embedded_speakers = None
self.embedded_speakers_projected = None
# global style token
if self.gst and self.use_gst:
self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim
self.gst_layer = None
# Capacitron
if self.capacitron_vae and self.use_capacitron_vae:
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim # add capacitron embedding dim
self.capacitron_vae_layer = None
# additional layers
self.decoder_backward = None
self.coarse_decoder = None
@staticmethod
def _format_aux_input(aux_input: Dict) -> Dict:
"""Set missing fields to their default values"""
if aux_input:
return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input)
return None
#############################
# INIT FUNCTIONS
#############################
def _init_backward_decoder(self):
"""Init the backward decoder for Forward-Backward decoding."""
self.decoder_backward = copy.deepcopy(self.decoder)
def _init_coarse_decoder(self):
"""Init the coarse decoder for Double-Decoder Consistency."""
self.coarse_decoder = copy.deepcopy(self.decoder)
self.coarse_decoder.r_init = self.ddc_r
self.coarse_decoder.set_r(self.ddc_r)
#############################
# CORE FUNCTIONS
#############################
@abstractmethod
def forward(self):
pass
@abstractmethod
def inference(self):
pass
def load_checkpoint(
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
"""Load model checkpoint and set up internals.
Args:
config (Coqpi): model configuration.
checkpoint_path (str): path to checkpoint file.
eval (bool, optional): whether to load model for evaluation.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
# TODO: set r in run-time by taking it from the new config
if "r" in state:
# set r from the state (for compatibility with older checkpoints)
self.decoder.set_r(state["r"])
elif "config" in state:
# set r from config used at training time (for inference)
self.decoder.set_r(state["config"]["r"])
else:
# set r from the new config (for new-models)
self.decoder.set_r(config.r)
if eval:
self.eval()
print(f" > Model's reduction rate `r` is set to: {self.decoder.r}")
assert not self.training
def get_criterion(self) -> nn.Module:
"""Get the model criterion used in training."""
return TacotronLoss(self.config)
@staticmethod
def init_from_config(config: Coqpit):
"""Initialize model from config."""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config)
tokenizer = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config)
return BaseTacotron(config, ap, tokenizer, speaker_manager)
##########################
# TEST AND LOG FUNCTIONS #
##########################
def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
"""Generic test run for `tts` models used by `Trainer`.
You can override this for a different behaviour.
Args:
assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.
Returns:
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
"""
print(" | > Synthesizing test sentences.")
test_audios = {}
test_figures = {}
test_sentences = self.config.test_sentences
aux_inputs = self._get_test_aux_input()
for idx, sen in enumerate(test_sentences):
outputs_dict = synthesis(
self,
sen,
self.config,
"cuda" in str(next(self.parameters()).device),
speaker_id=aux_inputs["speaker_id"],
d_vector=aux_inputs["d_vector"],
style_wav=aux_inputs["style_wav"],
use_griffin_lim=True,
do_trim_silence=False,
)
test_audios["{}-audio".format(idx)] = outputs_dict["wav"]
test_figures["{}-prediction".format(idx)] = plot_spectrogram(
outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False
)
test_figures["{}-alignment".format(idx)] = plot_alignment(
outputs_dict["outputs"]["alignments"], output_fig=False
)
return {"figures": test_figures, "audios": test_audios}
def test_log(
self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument
) -> None:
logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)
logger.test_figures(steps, outputs["figures"])
#############################
# COMMON COMPUTE FUNCTIONS
#############################
def compute_masks(self, text_lengths, mel_lengths):
"""Compute masks against sequence paddings."""
# B x T_in_max (boolean)
input_mask = sequence_mask(text_lengths)
output_mask = None
if mel_lengths is not None:
max_len = mel_lengths.max()
r = self.decoder.r
max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len
output_mask = sequence_mask(mel_lengths, max_len=max_len)
return input_mask, output_mask
def _backward_pass(self, mel_specs, encoder_outputs, mask):
"""Run backwards decoder"""
decoder_outputs_b, alignments_b, _ = self.decoder_backward(
encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask
)
decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
return decoder_outputs_b, alignments_b
def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask):
"""Double Decoder Consistency"""
T = mel_specs.shape[1]
if T % self.coarse_decoder.r > 0:
padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0))
decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(
encoder_outputs.detach(), mel_specs, input_mask
)
# scale_factor = self.decoder.r_init / self.decoder.r
alignments_backward = torch.nn.functional.interpolate(
alignments_backward.transpose(1, 2),
size=alignments.shape[1],
mode="nearest",
).transpose(1, 2)
decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
return decoder_outputs_backward, alignments_backward
#############################
# EMBEDDING FUNCTIONS
#############################
def compute_gst(self, inputs, style_input, speaker_embedding=None):
"""Compute global style token"""
if isinstance(style_input, dict):
# multiply each style token with a weight
query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs)
if speaker_embedding is not None:
query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1)
_GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs)
for k_token, v_amplifier in style_input.items():
key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
elif style_input is None:
# ignore style token and return zero tensor
gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs)
else:
# compute style tokens
gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable
inputs = self._concat_speaker_embedding(inputs, gst_outputs)
return inputs
def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
"""Capacitron Variational Autoencoder"""
(
VAE_outputs,
posterior_distribution,
prior_distribution,
capacitron_beta,
) = self.capacitron_vae_layer(
reference_mel_info,
text_info,
speaker_embedding, # pylint: disable=not-callable
)
VAE_outputs = VAE_outputs.to(inputs.device)
encoder_output = self._concat_speaker_embedding(
inputs, VAE_outputs
) # concatenate to the output of the basic tacotron encoder
return (
encoder_output,
posterior_distribution,
prior_distribution,
capacitron_beta,
)
@staticmethod
def _add_speaker_embedding(outputs, embedded_speakers):
embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
outputs = outputs + embedded_speakers_
return outputs
@staticmethod
def _concat_speaker_embedding(outputs, embedded_speakers):
embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
outputs = torch.cat([outputs, embedded_speakers_], dim=-1)
return outputs
#############################
# CALLBACKS
#############################
def on_epoch_start(self, trainer):
"""Callback for setting values wrt gradual training schedule.
Args:
trainer (TrainerTTS): TTS trainer object that is used to train this model.
"""
if self.gradual_training:
r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config)
trainer.config.r = r
self.decoder.set_r(r)
if trainer.config.bidirectional_decoder:
trainer.model.decoder_backward.set_r(r)
print(f"\n > Number of output frames: {self.decoder.r}")
+459
View File
@@ -0,0 +1,459 @@
import os
import random
from typing import Dict, List, Tuple, Union
import torch
import torch.distributed as dist
from coqpit import Coqpit
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler
from trainer.torch import DistributedSampler, DistributedSamplerWrapper
from TTS.model import BaseTrainerModel
from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.utils.data import get_length_balancer_weights
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
# pylint: skip-file
class BaseTTS(BaseTrainerModel):
"""Base `tts` class. Every new `tts` model must inherit this.
It defines common `tts` specific functions on top of `Model` implementation.
"""
MODEL_TYPE = "tts"
def __init__(
self,
config: Coqpit,
ap: "AudioProcessor",
tokenizer: "TTSTokenizer",
speaker_manager: SpeakerManager = None,
language_manager: LanguageManager = None,
):
super().__init__()
self.config = config
self.ap = ap
self.tokenizer = tokenizer
self.speaker_manager = speaker_manager
self.language_manager = language_manager
self._set_model_args(config)
def _set_model_args(self, config: Coqpit):
"""Setup model args based on the config type (`ModelConfig` or `ModelArgs`).
`ModelArgs` has all the fields reuqired to initialize the model architecture.
`ModelConfig` has all the fields required for training, inference and containes `ModelArgs`.
If the config is for training with a name like "*Config", then the model args are embeded in the
config.model_args
If the config is for the model with a name like "*Args", then we assign the directly.
"""
# don't use isintance not to import recursively
if "Config" in config.__class__.__name__:
config_num_chars = (
self.config.model_args.num_chars if hasattr(self.config, "model_args") else self.config.num_chars
)
num_chars = config_num_chars if self.tokenizer is None else self.tokenizer.characters.num_chars
if "characters" in config:
self.config.num_chars = num_chars
if hasattr(self.config, "model_args"):
config.model_args.num_chars = num_chars
self.args = self.config.model_args
else:
self.config = config
self.args = config.model_args
elif "Args" in config.__class__.__name__:
self.args = config
else:
raise ValueError("config must be either a *Config or *Args")
def init_multispeaker(self, config: Coqpit, data: List = None):
"""Initialize a speaker embedding layer if needen and define expected embedding channel size for defining
`in_channels` size of the connected layers.
This implementation yields 3 possible outcomes:
1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing.
2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512.
3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of
`config.d_vector_dim` or 512.
You can override this function for new models.
Args:
config (Coqpit): Model configuration.
"""
# set number of speakers
if self.speaker_manager is not None:
self.num_speakers = self.speaker_manager.num_speakers
elif hasattr(config, "num_speakers"):
self.num_speakers = config.num_speakers
# set ultimate speaker embedding size
if config.use_speaker_embedding or config.use_d_vector_file:
self.embedded_speaker_dim = (
config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
)
# init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file:
print(" > Init speaker_embedding layer.")
self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
def get_aux_input(self, **kwargs) -> Dict:
"""Prepare and return `aux_input` used by `forward()`"""
return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None}
def get_aux_input_from_test_sentences(self, sentence_info):
if hasattr(self.config, "model_args"):
config = self.config.model_args
else:
config = self.config
# extract speaker and language info
text, speaker_name, style_wav, language_name = None, None, None, None
if isinstance(sentence_info, list):
if len(sentence_info) == 1:
text = sentence_info[0]
elif len(sentence_info) == 2:
text, speaker_name = sentence_info
elif len(sentence_info) == 3:
text, speaker_name, style_wav = sentence_info
elif len(sentence_info) == 4:
text, speaker_name, style_wav, language_name = sentence_info
else:
text = sentence_info
# get speaker id/d_vector
speaker_id, d_vector, language_id = None, None, None
if self.speaker_manager is not None:
if config.use_d_vector_file:
if speaker_name is None:
d_vector = self.speaker_manager.get_random_embedding()
else:
d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name)
elif config.use_speaker_embedding:
if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id()
else:
speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id
if self.language_manager is not None and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.name_to_id[language_name]
return {
"text": text,
"speaker_id": speaker_id,
"style_wav": style_wav,
"d_vector": d_vector,
"language_id": language_id,
}
def format_batch(self, batch: Dict) -> Dict:
"""Generic batch formatting for `TTSDataset`.
You must override this if you use a custom dataset.
Args:
batch (Dict): [description]
Returns:
Dict: [description]
"""
# setup input batch
text_input = batch["token_id"]
text_lengths = batch["token_id_lengths"]
speaker_names = batch["speaker_names"]
linear_input = batch["linear"]
mel_input = batch["mel"]
mel_lengths = batch["mel_lengths"]
stop_targets = batch["stop_targets"]
item_idx = batch["item_idxs"]
d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
attn_mask = batch["attns"]
waveform = batch["waveform"]
pitch = batch["pitch"]
energy = batch["energy"]
language_ids = batch["language_ids"]
max_text_length = torch.max(text_lengths.float())
max_spec_length = torch.max(mel_lengths.float())
# compute durations from attention masks
durations = None
if attn_mask is not None:
durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2])
for idx, am in enumerate(attn_mask):
# compute raw durations
c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1]
# c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True)
c_idxs, counts = torch.unique(c_idxs, return_counts=True)
dur = torch.ones([text_lengths[idx]]).to(counts.dtype)
dur[c_idxs] = counts
# smooth the durations and set any 0 duration to 1
# by cutting off from the largest duration indeces.
extra_frames = dur.sum() - mel_lengths[idx]
largest_idxs = torch.argsort(-dur)[:extra_frames]
dur[largest_idxs] -= 1
assert (
dur.sum() == mel_lengths[idx]
), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
durations[idx, : text_lengths[idx]] = dur
# set stop targets wrt reduction factor
stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
stop_target_lengths = torch.divide(mel_lengths, self.config.r).ceil_()
return {
"text_input": text_input,
"text_lengths": text_lengths,
"speaker_names": speaker_names,
"mel_input": mel_input,
"mel_lengths": mel_lengths,
"linear_input": linear_input,
"stop_targets": stop_targets,
"stop_target_lengths": stop_target_lengths,
"attn_mask": attn_mask,
"durations": durations,
"speaker_ids": speaker_ids,
"d_vectors": d_vectors,
"max_text_length": float(max_text_length),
"max_spec_length": float(max_spec_length),
"item_idx": item_idx,
"waveform": waveform,
"pitch": pitch,
"energy": energy,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_names"],
}
def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
weights = None
data_items = dataset.samples
if getattr(config, "use_language_weighted_sampler", False):
alpha = getattr(config, "language_weighted_sampler_alpha", 1.0)
print(" > Using Language weighted sampler with alpha:", alpha)
weights = get_language_balancer_weights(data_items) * alpha
if getattr(config, "use_speaker_weighted_sampler", False):
alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0)
print(" > Using Speaker weighted sampler with alpha:", alpha)
if weights is not None:
weights += get_speaker_balancer_weights(data_items) * alpha
else:
weights = get_speaker_balancer_weights(data_items) * alpha
if getattr(config, "use_length_weighted_sampler", False):
alpha = getattr(config, "length_weighted_sampler_alpha", 1.0)
print(" > Using Length weighted sampler with alpha:", alpha)
if weights is not None:
weights += get_length_balancer_weights(data_items) * alpha
else:
weights = get_length_balancer_weights(data_items) * alpha
if weights is not None:
sampler = WeightedRandomSampler(weights, len(weights))
else:
sampler = None
# sampler for DDP
if sampler is None:
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
else: # If a sampler is already defined use this sampler and DDP sampler together
sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler
return sampler
def get_data_loader(
self,
config: Coqpit,
assets: Dict,
is_eval: bool,
samples: Union[List[Dict], List[List]],
verbose: bool,
num_gpus: int,
rank: int = None,
) -> "DataLoader":
if is_eval and not config.run_eval:
loader = None
else:
# setup multi-speaker attributes
if self.speaker_manager is not None:
if hasattr(config, "model_args"):
speaker_id_mapping = (
self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None
)
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
config.use_d_vector_file = config.model_args.use_d_vector_file
else:
speaker_id_mapping = self.speaker_manager.name_to_id if config.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
else:
speaker_id_mapping = None
d_vector_mapping = None
# setup multi-lingual attributes
if self.language_manager is not None:
language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None
else:
language_id_mapping = None
# init dataloader
dataset = TTSDataset(
outputs_per_step=config.r if "r" in config else 1,
compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec,
compute_f0=config.get("compute_f0", False),
f0_cache_path=config.get("f0_cache_path", None),
compute_energy=config.get("compute_energy", False),
energy_cache_path=config.get("energy_cache_path", None),
samples=samples,
ap=self.ap,
return_wav=config.return_wav if "return_wav" in config else False,
batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
min_text_len=config.min_text_len,
max_text_len=config.max_text_len,
min_audio_len=config.min_audio_len,
max_audio_len=config.max_audio_len,
phoneme_cache_path=config.phoneme_cache_path,
precompute_num_workers=config.precompute_num_workers,
use_noise_augment=False if is_eval else config.use_noise_augment,
verbose=verbose,
speaker_id_mapping=speaker_id_mapping,
d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
tokenizer=self.tokenizer,
start_by_longest=config.start_by_longest,
language_id_mapping=language_id_mapping,
)
# wait all the DDP process to be ready
if num_gpus > 1:
dist.barrier()
# sort input sequences from short to long
dataset.preprocess_samples()
# get samplers
sampler = self.get_sampler(config, dataset, num_gpus)
loader = DataLoader(
dataset,
batch_size=config.eval_batch_size if is_eval else config.batch_size,
shuffle=config.shuffle if sampler is None else False, # if there is no other sampler
collate_fn=dataset.collate_fn,
drop_last=config.drop_last, # setting this False might cause issues in AMP training.
sampler=sampler,
num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
pin_memory=False,
)
return loader
def _get_test_aux_input(
self,
) -> Dict:
d_vector = None
if self.config.use_d_vector_file:
d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings]
d_vector = (random.sample(sorted(d_vector), 1),)
aux_inputs = {
"speaker_id": None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
"d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input
}
return aux_inputs
def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
"""Generic test run for `tts` models used by `Trainer`.
You can override this for a different behaviour.
Args:
assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.
Returns:
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
"""
print(" | > Synthesizing test sentences.")
test_audios = {}
test_figures = {}
test_sentences = self.config.test_sentences
aux_inputs = self._get_test_aux_input()
for idx, sen in enumerate(test_sentences):
if isinstance(sen, list):
aux_inputs = self.get_aux_input_from_test_sentences(sen)
sen = aux_inputs["text"]
outputs_dict = synthesis(
self,
sen,
self.config,
"cuda" in str(next(self.parameters()).device),
speaker_id=aux_inputs["speaker_id"],
d_vector=aux_inputs["d_vector"],
style_wav=aux_inputs["style_wav"],
use_griffin_lim=True,
do_trim_silence=False,
)
test_audios["{}-audio".format(idx)] = outputs_dict["wav"]
test_figures["{}-prediction".format(idx)] = plot_spectrogram(
outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False
)
test_figures["{}-alignment".format(idx)] = plot_alignment(
outputs_dict["outputs"]["alignments"], output_fig=False
)
return test_figures, test_audios
def on_init_start(self, trainer):
"""Save the speaker.pth and language_ids.json at the beginning of the training. Also update both paths."""
if self.speaker_manager is not None:
output_path = os.path.join(trainer.output_path, "speakers.pth")
self.speaker_manager.save_ids_to_file(output_path)
trainer.config.speakers_file = output_path
# some models don't have `model_args` set
if hasattr(trainer.config, "model_args"):
trainer.config.model_args.speakers_file = output_path
trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
print(f" > `speakers.pth` is saved to {output_path}.")
print(" > `speakers_file` is updated in the config.json.")
if self.language_manager is not None:
output_path = os.path.join(trainer.output_path, "language_ids.json")
self.language_manager.save_ids_to_file(output_path)
trainer.config.language_ids_file = output_path
if hasattr(trainer.config, "model_args"):
trainer.config.model_args.language_ids_file = output_path
trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
print(f" > `language_ids.json` is saved to {output_path}.")
print(" > `language_ids_file` is updated in the config.json.")
class BaseTTSE2E(BaseTTS):
def _set_model_args(self, config: Coqpit):
self.config = config
if "Config" in config.__class__.__name__:
num_chars = (
self.config.model_args.num_chars if self.tokenizer is None else self.tokenizer.characters.num_chars
)
self.config.model_args.num_chars = num_chars
self.config.num_chars = num_chars
self.args = config.model_args
self.args.num_chars = num_chars
elif "Args" in config.__class__.__name__:
self.args = config
self.args.num_chars = self.args.num_chars
else:
raise ValueError("config must be either a *Config or *Args")
File diff suppressed because it is too large Load Diff
+862
View File
@@ -0,0 +1,862 @@
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Union
import torch
from coqpit import Coqpit
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.aligner import AlignmentNetwork
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram
from TTS.utils.io import load_fsspec
@dataclass
class ForwardTTSArgs(Coqpit):
"""ForwardTTS Model arguments.
Args:
num_chars (int):
Number of characters in the vocabulary. Defaults to 100.
out_channels (int):
Number of output channels. Defaults to 80.
hidden_channels (int):
Number of base hidden channels of the model. Defaults to 512.
use_aligner (bool):
Whether to use aligner network to learn the text to speech alignment or use pre-computed durations.
If set False, durations should be computed by `TTS/bin/compute_attention_masks.py` and path to the
pre-computed durations must be provided to `config.datasets[0].meta_file_attn_mask`. Defaults to True.
use_pitch (bool):
Use pitch predictor to learn the pitch. Defaults to True.
use_energy (bool):
Use energy predictor to learn the energy. Defaults to True.
duration_predictor_hidden_channels (int):
Number of hidden channels in the duration predictor. Defaults to 256.
duration_predictor_dropout_p (float):
Dropout rate for the duration predictor. Defaults to 0.1.
duration_predictor_kernel_size (int):
Kernel size of conv layers in the duration predictor. Defaults to 3.
pitch_predictor_hidden_channels (int):
Number of hidden channels in the pitch predictor. Defaults to 256.
pitch_predictor_dropout_p (float):
Dropout rate for the pitch predictor. Defaults to 0.1.
pitch_predictor_kernel_size (int):
Kernel size of conv layers in the pitch predictor. Defaults to 3.
pitch_embedding_kernel_size (int):
Kernel size of the projection layer in the pitch predictor. Defaults to 3.
energy_predictor_hidden_channels (int):
Number of hidden channels in the energy predictor. Defaults to 256.
energy_predictor_dropout_p (float):
Dropout rate for the energy predictor. Defaults to 0.1.
energy_predictor_kernel_size (int):
Kernel size of conv layers in the energy predictor. Defaults to 3.
energy_embedding_kernel_size (int):
Kernel size of the projection layer in the energy predictor. Defaults to 3.
positional_encoding (bool):
Whether to use positional encoding. Defaults to True.
positional_encoding_use_scale (bool):
Whether to use a learnable scale coeff in the positional encoding. Defaults to True.
length_scale (int):
Length scale that multiplies the predicted durations. Larger values result slower speech. Defaults to 1.0.
encoder_type (str):
Type of the encoder module. One of the encoders available in :class:`TTS.tts.layers.feed_forward.encoder`.
Defaults to `fftransformer` as in the paper.
encoder_params (dict):
Parameters of the encoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}```
decoder_type (str):
Type of the decoder module. One of the decoders available in :class:`TTS.tts.layers.feed_forward.decoder`.
Defaults to `fftransformer` as in the paper.
decoder_params (str):
Parameters of the decoder module. Defaults to ```{"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}```
detach_duration_predictor (bool):
Detach the input to the duration predictor from the earlier computation graph so that the duraiton loss
does not pass to the earlier layers. Defaults to True.
max_duration (int):
Maximum duration accepted by the model. Defaults to 75.
num_speakers (int):
Number of speakers for the speaker embedding layer. Defaults to 0.
speakers_file (str):
Path to the speaker mapping file for the Speaker Manager. Defaults to None.
speaker_embedding_channels (int):
Number of speaker embedding channels. Defaults to 256.
use_d_vector_file (bool):
Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
d_vector_dim (int):
Number of d-vector channels. Defaults to 0.
"""
num_chars: int = None
out_channels: int = 80
hidden_channels: int = 384
use_aligner: bool = True
# pitch params
use_pitch: bool = True
pitch_predictor_hidden_channels: int = 256
pitch_predictor_kernel_size: int = 3
pitch_predictor_dropout_p: float = 0.1
pitch_embedding_kernel_size: int = 3
# energy params
use_energy: bool = False
energy_predictor_hidden_channels: int = 256
energy_predictor_kernel_size: int = 3
energy_predictor_dropout_p: float = 0.1
energy_embedding_kernel_size: int = 3
# duration params
duration_predictor_hidden_channels: int = 256
duration_predictor_kernel_size: int = 3
duration_predictor_dropout_p: float = 0.1
positional_encoding: bool = True
poisitonal_encoding_use_scale: bool = True
length_scale: int = 1
encoder_type: str = "fftransformer"
encoder_params: dict = field(
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}
)
decoder_type: str = "fftransformer"
decoder_params: dict = field(
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 1, "num_layers": 6, "dropout_p": 0.1}
)
detach_duration_predictor: bool = False
max_duration: int = 75
num_speakers: int = 1
use_speaker_embedding: bool = False
speakers_file: str = None
use_d_vector_file: bool = False
d_vector_dim: int = None
d_vector_file: str = None
class ForwardTTS(BaseTTS):
"""General forward TTS model implementation that uses an encoder-decoder architecture with an optional alignment
network and a pitch predictor.
If the alignment network is used, the model learns the text-to-speech alignment
from the data instead of using pre-computed durations.
If the pitch predictor is used, the model trains a pitch predictor that predicts average pitch value for each
input character as in the FastPitch model.
`ForwardTTS` can be configured to one of these architectures,
- FastPitch
- SpeedySpeech
- FastSpeech
- FastSpeech2 (requires average speech energy predictor)
Args:
config (Coqpit): Model coqpit class.
speaker_manager (SpeakerManager): Speaker manager for multi-speaker training. Only used for multi-speaker models.
Defaults to None.
Examples:
>>> from TTS.tts.models.fast_pitch import ForwardTTS, ForwardTTSArgs
>>> config = ForwardTTSArgs()
>>> model = ForwardTTS(config)
"""
# pylint: disable=dangerous-default-value
def __init__(
self,
config: Coqpit,
ap: "AudioProcessor" = None,
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
self._set_model_args(config)
self.init_multispeaker(config)
self.max_duration = self.args.max_duration
self.use_aligner = self.args.use_aligner
self.use_pitch = self.args.use_pitch
self.use_energy = self.args.use_energy
self.binary_loss_weight = 0.0
self.length_scale = (
float(self.args.length_scale) if isinstance(self.args.length_scale, int) else self.args.length_scale
)
self.emb = nn.Embedding(self.args.num_chars, self.args.hidden_channels)
self.encoder = Encoder(
self.args.hidden_channels,
self.args.hidden_channels,
self.args.encoder_type,
self.args.encoder_params,
self.embedded_speaker_dim,
)
if self.args.positional_encoding:
self.pos_encoder = PositionalEncoding(self.args.hidden_channels)
self.decoder = Decoder(
self.args.out_channels,
self.args.hidden_channels,
self.args.decoder_type,
self.args.decoder_params,
)
self.duration_predictor = DurationPredictor(
self.args.hidden_channels,
self.args.duration_predictor_hidden_channels,
self.args.duration_predictor_kernel_size,
self.args.duration_predictor_dropout_p,
)
if self.args.use_pitch:
self.pitch_predictor = DurationPredictor(
self.args.hidden_channels,
self.args.pitch_predictor_hidden_channels,
self.args.pitch_predictor_kernel_size,
self.args.pitch_predictor_dropout_p,
)
self.pitch_emb = nn.Conv1d(
1,
self.args.hidden_channels,
kernel_size=self.args.pitch_embedding_kernel_size,
padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
)
if self.args.use_energy:
self.energy_predictor = DurationPredictor(
self.args.hidden_channels,
self.args.energy_predictor_hidden_channels,
self.args.energy_predictor_kernel_size,
self.args.energy_predictor_dropout_p,
)
self.energy_emb = nn.Conv1d(
1,
self.args.hidden_channels,
kernel_size=self.args.energy_embedding_kernel_size,
padding=int((self.args.energy_embedding_kernel_size - 1) / 2),
)
if self.args.use_aligner:
self.aligner = AlignmentNetwork(
in_query_channels=self.args.out_channels, in_key_channels=self.args.hidden_channels
)
def init_multispeaker(self, config: Coqpit):
"""Init for multi-speaker training.
Args:
config (Coqpit): Model configuration.
"""
self.embedded_speaker_dim = 0
# init speaker manager
if self.speaker_manager is None and (config.use_d_vector_file or config.use_speaker_embedding):
raise ValueError(
" > SpeakerManager is not provided. You must provide the SpeakerManager before initializing a multi-speaker model."
)
# set number of speakers
if self.speaker_manager is not None:
self.num_speakers = self.speaker_manager.num_speakers
# init d-vector embedding
if config.use_d_vector_file:
self.embedded_speaker_dim = config.d_vector_dim
if self.args.d_vector_dim != self.args.hidden_channels:
#self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1)
self.proj_g = nn.Linear(in_features=self.args.d_vector_dim, out_features=self.args.hidden_channels)
# init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file:
print(" > Init speaker_embedding layer.")
self.emb_g = nn.Embedding(self.num_speakers, self.args.hidden_channels)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
@staticmethod
def generate_attn(dr, x_mask, y_mask=None):
"""Generate an attention mask from the durations.
Shapes
- dr: :math:`(B, T_{en})`
- x_mask: :math:`(B, T_{en})`
- y_mask: :math:`(B, T_{de})`
"""
# compute decode mask from the durations
if y_mask is None:
y_lengths = dr.sum(1).long()
y_lengths[y_lengths < 1] = 1
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
return attn
def expand_encoder_outputs(self, en, dr, x_mask, y_mask):
"""Generate attention alignment map from durations and
expand encoder outputs
Shapes:
- en: :math:`(B, D_{en}, T_{en})`
- dr: :math:`(B, T_{en})`
- x_mask: :math:`(B, T_{en})`
- y_mask: :math:`(B, T_{de})`
Examples::
encoder output: [a,b,c,d]
durations: [1, 3, 2, 1]
expanded: [a, b, b, b, c, c, d]
attention map: [[0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 1, 1, 0],
[0, 1, 1, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0]]
"""
attn = self.generate_attn(dr, x_mask, y_mask)
o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2).to(en.dtype), en.transpose(1, 2)).transpose(1, 2)
return o_en_ex, attn
def format_durations(self, o_dr_log, x_mask):
"""Format predicted durations.
1. Convert to linear scale from log scale
2. Apply the length scale for speed adjustment
3. Apply masking.
4. Cast 0 durations to 1.
5. Round the duration values.
Args:
o_dr_log: Log scale durations.
x_mask: Input text mask.
Shapes:
- o_dr_log: :math:`(B, T_{de})`
- x_mask: :math:`(B, T_{en})`
"""
o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale
o_dr[o_dr < 1] = 1.0
o_dr = torch.round(o_dr)
return o_dr
def _forward_encoder(
self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None
) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
"""Encoding forward pass.
1. Embed speaker IDs if multi-speaker mode.
2. Embed character sequences.
3. Run the encoder network.
4. Sum encoder outputs and speaker embeddings
Args:
x (torch.LongTensor): Input sequence IDs.
x_mask (torch.FloatTensor): Input squence mask.
g (torch.FloatTensor, optional): Conditioning vectors. In general speaker embeddings. Defaults to None.
Returns:
Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor]:
encoder output, encoder output for the duration predictor, input sequence mask, speaker embeddings,
character embeddings
Shapes:
- x: :math:`(B, T_{en})`
- x_mask: :math:`(B, 1, T_{en})`
- g: :math:`(B, C)`
"""
if hasattr(self, "emb_g"):
g = g.type(torch.LongTensor)
g = self.emb_g(g) # [B, C, 1]
if g is not None:
g = g.unsqueeze(-1)
# [B, T, C]
x_emb = self.emb(x)
# encoder pass
#o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask)
o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask, g)
# speaker conditioning
# TODO: try different ways of conditioning
if g is not None:
if hasattr(self, "proj_g"):
g = self.proj_g(g.view(g.shape[0], -1)).unsqueeze(-1)
o_en = o_en + g
return o_en, x_mask, g, x_emb
def _forward_decoder(
self,
o_en: torch.FloatTensor,
dr: torch.IntTensor,
x_mask: torch.FloatTensor,
y_lengths: torch.IntTensor,
g: torch.FloatTensor,
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
"""Decoding forward pass.
1. Compute the decoder output mask
2. Expand encoder output with the durations.
3. Apply position encoding.
4. Add speaker embeddings if multi-speaker mode.
5. Run the decoder.
Args:
o_en (torch.FloatTensor): Encoder output.
dr (torch.IntTensor): Ground truth durations or alignment network durations.
x_mask (torch.IntTensor): Input sequence mask.
y_lengths (torch.IntTensor): Output sequence lengths.
g (torch.FloatTensor): Conditioning vectors. In general speaker embeddings.
Returns:
Tuple[torch.FloatTensor, torch.FloatTensor]: Decoder output, attention map from durations.
"""
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype)
# expand o_en with durations
o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask)
# positional encoding
if hasattr(self, "pos_encoder"):
o_en_ex = self.pos_encoder(o_en_ex, y_mask)
# decoder pass
o_de = self.decoder(o_en_ex, y_mask, g=g)
return o_de.transpose(1, 2), attn.transpose(1, 2)
def _forward_pitch_predictor(
self,
o_en: torch.FloatTensor,
x_mask: torch.IntTensor,
pitch: torch.FloatTensor = None,
dr: torch.IntTensor = None,
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
"""Pitch predictor forward pass.
1. Predict pitch from encoder outputs.
2. In training - Compute average pitch values for each input character from the ground truth pitch values.
3. Embed average pitch values.
Args:
o_en (torch.FloatTensor): Encoder output.
x_mask (torch.IntTensor): Input sequence mask.
pitch (torch.FloatTensor, optional): Ground truth pitch values. Defaults to None.
dr (torch.IntTensor, optional): Ground truth durations. Defaults to None.
Returns:
Tuple[torch.FloatTensor, torch.FloatTensor]: Pitch embedding, pitch prediction.
Shapes:
- o_en: :math:`(B, C, T_{en})`
- x_mask: :math:`(B, 1, T_{en})`
- pitch: :math:`(B, 1, T_{de})`
- dr: :math:`(B, T_{en})`
"""
o_pitch = self.pitch_predictor(o_en, x_mask)
if pitch is not None:
avg_pitch = average_over_durations(pitch, dr)
o_pitch_emb = self.pitch_emb(avg_pitch)
return o_pitch_emb, o_pitch, avg_pitch
o_pitch_emb = self.pitch_emb(o_pitch)
return o_pitch_emb, o_pitch
def _forward_energy_predictor(
self,
o_en: torch.FloatTensor,
x_mask: torch.IntTensor,
energy: torch.FloatTensor = None,
dr: torch.IntTensor = None,
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
"""Energy predictor forward pass.
1. Predict energy from encoder outputs.
2. In training - Compute average pitch values for each input character from the ground truth pitch values.
3. Embed average energy values.
Args:
o_en (torch.FloatTensor): Encoder output.
x_mask (torch.IntTensor): Input sequence mask.
energy (torch.FloatTensor, optional): Ground truth energy values. Defaults to None.
dr (torch.IntTensor, optional): Ground truth durations. Defaults to None.
Returns:
Tuple[torch.FloatTensor, torch.FloatTensor]: Energy embedding, energy prediction.
Shapes:
- o_en: :math:`(B, C, T_{en})`
- x_mask: :math:`(B, 1, T_{en})`
- pitch: :math:`(B, 1, T_{de})`
- dr: :math:`(B, T_{en})`
"""
o_energy = self.energy_predictor(o_en, x_mask)
if energy is not None:
avg_energy = average_over_durations(energy, dr)
o_energy_emb = self.energy_emb(avg_energy)
return o_energy_emb, o_energy, avg_energy
o_energy_emb = self.energy_emb(o_energy)
return o_energy_emb, o_energy
def _forward_aligner(
self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor
) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
"""Aligner forward pass.
1. Compute a mask to apply to the attention map.
2. Run the alignment network.
3. Apply MAS to compute the hard alignment map.
4. Compute the durations from the hard alignment map.
Args:
x (torch.FloatTensor): Input sequence.
y (torch.FloatTensor): Output sequence.
x_mask (torch.IntTensor): Input sequence mask.
y_mask (torch.IntTensor): Output sequence mask.
Returns:
Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
Durations from the hard alignment map, soft alignment potentials, log scale alignment potentials,
hard alignment map.
Shapes:
- x: :math:`[B, T_en, C_en]`
- y: :math:`[B, T_de, C_de]`
- x_mask: :math:`[B, 1, T_en]`
- y_mask: :math:`[B, 1, T_de]`
- o_alignment_dur: :math:`[B, T_en]`
- alignment_soft: :math:`[B, T_en, T_de]`
- alignment_logprob: :math:`[B, 1, T_de, T_en]`
- alignment_mas: :math:`[B, T_en, T_de]`
"""
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
alignment_soft, alignment_logprob = self.aligner(y.transpose(1, 2), x.transpose(1, 2), x_mask, None)
alignment_mas = maximum_path(
alignment_soft.squeeze(1).transpose(1, 2).contiguous(), attn_mask.squeeze(1).contiguous()
)
o_alignment_dur = torch.sum(alignment_mas, -1).int()
alignment_soft = alignment_soft.squeeze(1).transpose(1, 2)
return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas
def _set_speaker_input(self, aux_input: Dict):
d_vectors = aux_input.get("d_vectors", None)
speaker_ids = aux_input.get("speaker_ids", None)
if d_vectors is not None and speaker_ids is not None:
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
if speaker_ids is not None and not hasattr(self, "emb_g"):
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
g = speaker_ids if speaker_ids is not None else d_vectors
return g
def forward(
self,
x: torch.LongTensor,
x_lengths: torch.LongTensor,
y_lengths: torch.LongTensor,
y: torch.FloatTensor = None,
dr: torch.IntTensor = None,
pitch: torch.FloatTensor = None,
energy: torch.FloatTensor = None,
aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument
) -> Dict:
"""Model's forward pass.
Args:
x (torch.LongTensor): Input character sequences.
x_lengths (torch.LongTensor): Input sequence lengths.
y_lengths (torch.LongTensor): Output sequnce lengths. Defaults to None.
y (torch.FloatTensor): Spectrogram frames. Only used when the alignment network is on. Defaults to None.
dr (torch.IntTensor): Character durations over the spectrogram frames. Only used when the alignment network is off. Defaults to None.
pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Only used when the pitch predictor is on. Defaults to None.
energy (torch.FloatTensor): energy values for each spectrogram frame. Only used when the energy predictor is on. Defaults to None.
aux_input (Dict): Auxiliary model inputs for multi-speaker training. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.
Shapes:
- x: :math:`[B, T_max]`
- x_lengths: :math:`[B]`
- y_lengths: :math:`[B]`
- y: :math:`[B, T_max2]`
- dr: :math:`[B, T_max]`
- g: :math:`[B, C]`
- pitch: :math:`[B, 1, T]`
"""
g = self._set_speaker_input(aux_input)
# compute sequence masks
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).float()
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).float()
# encoder pass
o_en, x_mask, g, x_emb = self._forward_encoder(x, x_mask, g)
# duration predictor pass
if self.args.detach_duration_predictor:
o_dr_log = self.duration_predictor(o_en.detach(), x_mask)
else:
o_dr_log = self.duration_predictor(o_en, x_mask)
o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration)
# generate attn mask from predicted durations
o_attn = self.generate_attn(o_dr.squeeze(1), x_mask)
# aligner
o_alignment_dur = None
alignment_soft = None
alignment_logprob = None
alignment_mas = None
if self.use_aligner:
o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas = self._forward_aligner(
x_emb, y, x_mask, y_mask
)
alignment_soft = alignment_soft.transpose(1, 2)
alignment_mas = alignment_mas.transpose(1, 2)
dr = o_alignment_dur
# pitch predictor pass
o_pitch = None
avg_pitch = None
if self.args.use_pitch:
o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en, x_mask, pitch, dr)
o_en = o_en + o_pitch_emb
# energy predictor pass
o_energy = None
avg_energy = None
if self.args.use_energy:
o_energy_emb, o_energy, avg_energy = self._forward_energy_predictor(o_en, x_mask, energy, dr)
o_en = o_en + o_energy_emb
# decoder pass
o_de, attn = self._forward_decoder(
o_en, dr, x_mask, y_lengths, g=None
) # TODO: maybe pass speaker embedding (g) too
outputs = {
"model_outputs": o_de, # [B, T, C]
"durations_log": o_dr_log.squeeze(1), # [B, T]
"durations": o_dr.squeeze(1), # [B, T]
"attn_durations": o_attn, # for visualization [B, T_en, T_de']
"pitch_avg": o_pitch,
"pitch_avg_gt": avg_pitch,
"energy_avg": o_energy,
"energy_avg_gt": avg_energy,
"alignments": attn, # [B, T_de, T_en]
"alignment_soft": alignment_soft,
"alignment_mas": alignment_mas,
"o_alignment_dur": o_alignment_dur,
"alignment_logprob": alignment_logprob,
"x_mask": x_mask,
"y_mask": y_mask,
}
return outputs
@torch.no_grad()
def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument
"""Model's inference pass.
Args:
x (torch.LongTensor): Input character sequence.
aux_input (Dict): Auxiliary model inputs. Defaults to `{"d_vectors": None, "speaker_ids": None}`.
Shapes:
- x: [B, T_max]
- x_lengths: [B]
- g: [B, C]
"""
g = self._set_speaker_input(aux_input)
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype).float()
# encoder pass
o_en, x_mask, g, _ = self._forward_encoder(x, x_mask, g)
# duration predictor pass
o_dr_log = self.duration_predictor(o_en.squeeze(), x_mask)
o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1)
y_lengths = o_dr.sum(1)
# pitch predictor pass
o_pitch = None
if self.args.use_pitch:
o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en, x_mask)
o_en = o_en + o_pitch_emb
# energy predictor pass
o_energy = None
if self.args.use_energy:
o_energy_emb, o_energy = self._forward_energy_predictor(o_en, x_mask)
o_en = o_en + o_energy_emb
# decoder pass
o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=None)
outputs = {
"model_outputs": o_de,
"alignments": attn,
"pitch": o_pitch,
"energy": o_energy,
"durations_log": o_dr_log,
}
return outputs
def train_step(self, batch: dict, criterion: nn.Module):
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
pitch = batch["pitch"] if self.args.use_pitch else None
energy = batch["energy"] if self.args.use_energy else None
d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
durations = batch["durations"]
aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids}
# forward pass
outputs = self.forward(
text_input,
text_lengths,
mel_lengths,
y=mel_input,
dr=durations,
pitch=pitch,
energy=energy,
aux_input=aux_input,
)
# use aligner's output as the duration target
if self.use_aligner:
durations = outputs["o_alignment_dur"]
# use float32 in AMP
with autocast(enabled=False):
# compute loss
loss_dict = criterion(
decoder_output=outputs["model_outputs"],
decoder_target=mel_input,
decoder_output_lens=mel_lengths,
dur_output=outputs["durations_log"],
dur_target=durations,
pitch_output=outputs["pitch_avg"] if self.use_pitch else None,
pitch_target=outputs["pitch_avg_gt"] if self.use_pitch else None,
energy_output=outputs["energy_avg"] if self.use_energy else None,
energy_target=outputs["energy_avg_gt"] if self.use_energy else None,
input_lens=text_lengths,
alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None,
alignment_soft=outputs["alignment_soft"],
alignment_hard=outputs["alignment_mas"],
binary_loss_weight=self.binary_loss_weight,
)
# compute duration error
durations_pred = outputs["durations"]
duration_error = torch.abs(durations - durations_pred).sum() / text_lengths.sum()
loss_dict["duration_error"] = duration_error
return outputs, loss_dict
def _create_logs(self, batch, outputs, ap):
"""Create common logger outputs."""
model_outputs = outputs["model_outputs"]
alignments = outputs["alignments"]
mel_input = batch["mel_input"]
pred_spec = model_outputs[0].data.cpu().numpy()
gt_spec = mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False),
}
# plot pitch figures
if self.args.use_pitch:
pitch_avg = abs(outputs["pitch_avg_gt"][0, 0].data.cpu().numpy())
pitch_avg_hat = abs(outputs["pitch_avg"][0, 0].data.cpu().numpy())
chars = self.tokenizer.decode(batch["text_input"][0].data.cpu().numpy())
pitch_figures = {
"pitch_ground_truth": plot_avg_pitch(pitch_avg, chars, output_fig=False),
"pitch_avg_predicted": plot_avg_pitch(pitch_avg_hat, chars, output_fig=False),
}
figures.update(pitch_figures)
# plot energy figures
if self.args.use_energy:
energy_avg = abs(outputs["energy_avg_gt"][0, 0].data.cpu().numpy())
energy_avg_hat = abs(outputs["energy_avg"][0, 0].data.cpu().numpy())
chars = self.tokenizer.decode(batch["text_input"][0].data.cpu().numpy())
energy_figures = {
"energy_ground_truth": plot_avg_energy(energy_avg, chars, output_fig=False),
"energy_avg_predicted": plot_avg_energy(energy_avg_hat, chars, output_fig=False),
}
figures.update(energy_figures)
# plot the attention mask computed from the predicted durations
if "attn_durations" in outputs:
alignments_hat = outputs["attn_durations"][0].data.cpu().numpy()
figures["alignment_hat"] = plot_alignment(alignments_hat.T, output_fig=False)
# Sample audio
train_audio = ap.inv_melspectrogram(pred_spec.T)
return figures, {"audio": train_audio}
def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
) -> None: # pylint: disable=no-self-use
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.train_figures(steps, figures)
logger.train_audios(steps, audios, self.ap.sample_rate)
def eval_step(self, batch: dict, criterion: nn.Module):
return self.train_step(batch, criterion)
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.eval_figures(steps, figures)
logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint(
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()
assert not self.training
def get_criterion(self):
from TTS.tts.layers.losses import ForwardTTSLoss # pylint: disable=import-outside-toplevel
return ForwardTTSLoss(self.config)
def on_train_step_start(self, trainer):
"""Schedule binary loss weight."""
self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0
@staticmethod
def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None):
"""Initiate model from config
Args:
config (ForwardTTSConfig): Model config.
samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
Defaults to None.
"""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config)
tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config, samples)
return ForwardTTS(new_config, ap, tokenizer, speaker_manager)
+557
View File
@@ -0,0 +1,557 @@
import math
from typing import Dict, List, Tuple, Union
import torch
from coqpit import Coqpit
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from torch.nn import functional as F
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.layers.glow_tts.decoder import Decoder
from TTS.tts.layers.glow_tts.encoder import Encoder
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.io import load_fsspec
class GlowTTS(BaseTTS):
"""GlowTTS model.
Paper::
https://arxiv.org/abs/2005.11129
Paper abstract::
Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate
mel-spectrograms from text in parallel. Despite the advantage, the parallel TTS models cannot be trained
without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS,
a flow-based generative model for parallel TTS that does not require any external aligner. By combining the
properties of flows and dynamic programming, the proposed model searches for the most probable monotonic
alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard
monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows
enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over
the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our
model can be easily extended to a multi-speaker setting.
Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.
Examples:
Init only model layers.
>>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
>>> from TTS.tts.models.glow_tts import GlowTTS
>>> config = GlowTTSConfig(num_chars=2)
>>> model = GlowTTS(config)
Fully init a model ready for action. All the class attributes and class members
(e.g Tokenizer, AudioProcessor, etc.). are initialized internally based on config values.
>>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
>>> from TTS.tts.models.glow_tts import GlowTTS
>>> config = GlowTTSConfig()
>>> model = GlowTTS.init_from_config(config, verbose=False)
"""
def __init__(
self,
config: GlowTTSConfig,
ap: "AudioProcessor" = None,
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
# pass all config fields to `self`
# for fewer code change
self.config = config
for key in config:
setattr(self, key, config[key])
self.decoder_output_dim = config.out_channels
# init multi-speaker layers if necessary
self.init_multispeaker(config)
self.run_data_dep_init = config.data_dep_init_steps > 0
self.encoder = Encoder(
self.num_chars,
out_channels=self.out_channels,
hidden_channels=self.hidden_channels_enc,
hidden_channels_dp=self.hidden_channels_dp,
encoder_type=self.encoder_type,
encoder_params=self.encoder_params,
mean_only=self.mean_only,
use_prenet=self.use_encoder_prenet,
dropout_p_dp=self.dropout_p_dp,
c_in_channels=self.c_in_channels,
)
self.decoder = Decoder(
self.out_channels,
self.hidden_channels_dec,
self.kernel_size_dec,
self.dilation_rate,
self.num_flow_blocks_dec,
self.num_block_layers,
dropout_p=self.dropout_p_dec,
num_splits=self.num_splits,
num_squeeze=self.num_squeeze,
sigmoid_scale=self.sigmoid_scale,
c_in_channels=self.c_in_channels,
)
def init_multispeaker(self, config: Coqpit):
"""Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding
vector dimension to the encoder layer channel size. If model uses d-vectors, then it only sets
speaker embedding vector dimension to the d-vector dimension from the config.
Args:
config (Coqpit): Model configuration.
"""
self.embedded_speaker_dim = 0
# set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
if self.speaker_manager is not None:
self.num_speakers = self.speaker_manager.num_speakers
# set ultimate speaker embedding size
if config.use_d_vector_file:
self.embedded_speaker_dim = (
config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
)
if self.speaker_manager is not None:
assert (
config.d_vector_dim == self.speaker_manager.embedding_dim
), " [!] d-vector dimension mismatch b/w config and speaker manager."
# init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file:
print(" > Init speaker_embedding layer.")
self.embedded_speaker_dim = self.hidden_channels_enc
self.emb_g = nn.Embedding(self.num_speakers, self.hidden_channels_enc)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
# set conditioning dimensions
self.c_in_channels = self.embedded_speaker_dim
@staticmethod
def compute_outputs(attn, o_mean, o_log_scale, x_mask):
"""Compute and format the mode outputs with the given alignment map"""
y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose(
1, 2
) # [b, t', t], [b, t, d] -> [b, d, t']
y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose(
1, 2
) # [b, t', t], [b, t, d] -> [b, d, t']
# compute total duration with adjustment
o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask
return y_mean, y_log_scale, o_attn_dur
def unlock_act_norm_layers(self):
"""Unlock activation normalization layers for data depended initalization."""
for f in self.decoder.flows:
if getattr(f, "set_ddi", False):
f.set_ddi(True)
def lock_act_norm_layers(self):
"""Lock activation normalization layers."""
for f in self.decoder.flows:
if getattr(f, "set_ddi", False):
f.set_ddi(False)
def _set_speaker_input(self, aux_input: Dict):
if aux_input is None:
d_vectors = None
speaker_ids = None
else:
d_vectors = aux_input.get("d_vectors", None)
speaker_ids = aux_input.get("speaker_ids", None)
if d_vectors is not None and speaker_ids is not None:
raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")
if speaker_ids is not None and not hasattr(self, "emb_g"):
raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")
g = speaker_ids if speaker_ids is not None else d_vectors
return g
def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]:
g = self._set_speaker_input(aux_input)
# speaker embedding
if g is not None:
if hasattr(self, "emb_g"):
# use speaker embedding layer
if not g.size(): # if is a scalar
g = g.unsqueeze(0) # unsqueeze
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
else:
# use d-vector
g = F.normalize(g).unsqueeze(-1) # [b, h, 1]
return g
def forward(
self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
): # pylint: disable=dangerous-default-value
"""
Args:
x (torch.Tensor):
Input text sequence ids. :math:`[B, T_en]`
x_lengths (torch.Tensor):
Lengths of input text sequences. :math:`[B]`
y (torch.Tensor):
Target mel-spectrogram frames. :math:`[B, T_de, C_mel]`
y_lengths (torch.Tensor):
Lengths of target mel-spectrogram frames. :math:`[B]`
aux_input (Dict):
Auxiliary inputs. `d_vectors` is speaker embedding vectors for a multi-speaker model.
:math:`[B, D_vec]`. `speaker_ids` is speaker ids for a multi-speaker model usind speaker-embedding
layer. :math:`B`
Returns:
Dict:
- z: :math: `[B, T_de, C]`
- logdet: :math:`B`
- y_mean: :math:`[B, T_de, C]`
- y_log_scale: :math:`[B, T_de, C]`
- alignments: :math:`[B, T_en, T_de]`
- durations_log: :math:`[B, T_en, 1]`
- total_durations_log: :math:`[B, T_en, 1]`
"""
# [B, T, C] -> [B, C, T]
y = y.transpose(1, 2)
y_max_length = y.size(2)
# norm speaker embeddings
g = self._speaker_embedding(aux_input)
# embedding pass
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
# drop redisual frames wrt num_squeeze and set y_lengths.
y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
# create masks
y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
# [B, 1, T_en, T_de]
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
# decoder pass
z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
# find the alignment path
with torch.no_grad():
o_scale = torch.exp(-2 * o_log_scale)
logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1]
logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t']
logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t']
logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1]
logp = logp1 + logp2 + logp3 + logp4 # [b, t, t']
attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
attn = attn.squeeze(1).permute(0, 2, 1)
outputs = {
"z": z.transpose(1, 2),
"logdet": logdet,
"y_mean": y_mean.transpose(1, 2),
"y_log_scale": y_log_scale.transpose(1, 2),
"alignments": attn,
"durations_log": o_dur_log.transpose(1, 2),
"total_durations_log": o_attn_dur.transpose(1, 2),
}
return outputs
@torch.no_grad()
def inference_with_MAS(
self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
): # pylint: disable=dangerous-default-value
"""
It's similar to the teacher forcing in Tacotron.
It was proposed in: https://arxiv.org/abs/2104.05557
Shapes:
- x: :math:`[B, T]`
- x_lenghts: :math:`B`
- y: :math:`[B, T, C]`
- y_lengths: :math:`B`
- g: :math:`[B, C] or B`
"""
y = y.transpose(1, 2)
y_max_length = y.size(2)
# norm speaker embeddings
g = self._speaker_embedding(aux_input)
# embedding pass
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
# drop redisual frames wrt num_squeeze and set y_lengths.
y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
# create masks
y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
# decoder pass
z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
# find the alignment path between z and encoder output
o_scale = torch.exp(-2 * o_log_scale)
logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1]
logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t']
logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t']
logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1]
logp = logp1 + logp2 + logp3 + logp4 # [b, t, t']
attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
attn = attn.squeeze(1).permute(0, 2, 1)
# get predited aligned distribution
z = y_mean * y_mask
# reverse the decoder and predict using the aligned distribution
y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
outputs = {
"model_outputs": z.transpose(1, 2),
"logdet": logdet,
"y_mean": y_mean.transpose(1, 2),
"y_log_scale": y_log_scale.transpose(1, 2),
"alignments": attn,
"durations_log": o_dur_log.transpose(1, 2),
"total_durations_log": o_attn_dur.transpose(1, 2),
}
return outputs
@torch.no_grad()
def decoder_inference(
self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
): # pylint: disable=dangerous-default-value
"""
Shapes:
- y: :math:`[B, T, C]`
- y_lengths: :math:`B`
- g: :math:`[B, C] or B`
"""
y = y.transpose(1, 2)
y_max_length = y.size(2)
g = self._speaker_embedding(aux_input)
y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype)
# decoder pass
z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
# reverse decoder and predict
y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
outputs = {}
outputs["model_outputs"] = y.transpose(1, 2)
outputs["logdet"] = logdet
return outputs
@torch.no_grad()
def inference(
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
): # pylint: disable=dangerous-default-value
x_lengths = aux_input["x_lengths"]
g = self._speaker_embedding(aux_input)
# embedding pass
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
# compute output durations
w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale
w_ceil = torch.clamp_min(torch.ceil(w), 1)
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
y_max_length = None
# compute masks
y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
# compute attention mask
attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask
# decoder pass
y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
attn = attn.squeeze(1).permute(0, 2, 1)
outputs = {
"model_outputs": y.transpose(1, 2),
"logdet": logdet,
"y_mean": y_mean.transpose(1, 2),
"y_log_scale": y_log_scale.transpose(1, 2),
"alignments": attn,
"durations_log": o_dur_log.transpose(1, 2),
"total_durations_log": o_attn_dur.transpose(1, 2),
}
return outputs
def train_step(self, batch: dict, criterion: nn.Module):
"""A single training step. Forward pass and loss computation. Run data depended initialization for the
first `config.data_dep_init_steps` steps.
Args:
batch (dict): [description]
criterion (nn.Module): [description]
"""
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
if self.run_data_dep_init and self.training:
# compute data-dependent initialization of activation norm layers
self.unlock_act_norm_layers()
with torch.no_grad():
_ = self.forward(
text_input,
text_lengths,
mel_input,
mel_lengths,
aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
)
outputs = None
loss_dict = None
self.lock_act_norm_layers()
else:
# normal training step
outputs = self.forward(
text_input,
text_lengths,
mel_input,
mel_lengths,
aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
)
with autocast(enabled=False): # avoid mixed_precision in criterion
loss_dict = criterion(
outputs["z"].float(),
outputs["y_mean"].float(),
outputs["y_log_scale"].float(),
outputs["logdet"].float(),
mel_lengths,
outputs["durations_log"].float(),
outputs["total_durations_log"].float(),
text_lengths,
)
return outputs, loss_dict
def _create_logs(self, batch, outputs, ap):
alignments = outputs["alignments"]
text_input = batch["text_input"][:1] if batch["text_input"] is not None else None
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
d_vectors = batch["d_vectors"][:1] if batch["d_vectors"] is not None else None
speaker_ids = batch["speaker_ids"][:1] if batch["speaker_ids"] is not None else None
# model runs reverse flow to predict spectrograms
pred_outputs = self.inference(
text_input,
aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
)
model_outputs = pred_outputs["model_outputs"]
pred_spec = model_outputs[0].data.cpu().numpy()
gt_spec = mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False),
}
# Sample audio
train_audio = ap.inv_melspectrogram(pred_spec.T)
return figures, {"audio": train_audio}
def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
) -> None: # pylint: disable=no-self-use
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.train_figures(steps, figures)
logger.train_audios(steps, audios, self.ap.sample_rate)
@torch.no_grad()
def eval_step(self, batch: dict, criterion: nn.Module):
return self.train_step(batch, criterion)
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.eval_figures(steps, figures)
logger.eval_audios(steps, audios, self.ap.sample_rate)
@torch.no_grad()
def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
"""Generic test run for `tts` models used by `Trainer`.
You can override this for a different behaviour.
Returns:
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
"""
print(" | > Synthesizing test sentences.")
test_audios = {}
test_figures = {}
test_sentences = self.config.test_sentences
aux_inputs = self._get_test_aux_input()
if len(test_sentences) == 0:
print(" | [!] No test sentences provided.")
else:
for idx, sen in enumerate(test_sentences):
outputs = synthesis(
self,
sen,
self.config,
"cuda" in str(next(self.parameters()).device),
speaker_id=aux_inputs["speaker_id"],
d_vector=aux_inputs["d_vector"],
style_wav=aux_inputs["style_wav"],
use_griffin_lim=True,
do_trim_silence=False,
)
test_audios["{}-audio".format(idx)] = outputs["wav"]
test_figures["{}-prediction".format(idx)] = plot_spectrogram(
outputs["outputs"]["model_outputs"], self.ap, output_fig=False
)
test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False)
return test_figures, test_audios
def preprocess(self, y, y_lengths, y_max_length, attn=None):
if y_max_length is not None:
y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze
y = y[:, :, :y_max_length]
if attn is not None:
attn = attn[:, :, :, :y_max_length]
y_lengths = torch.div(y_lengths, self.num_squeeze, rounding_mode="floor") * self.num_squeeze
return y, y_lengths, y_max_length, attn
def store_inverse(self):
self.decoder.store_inverse()
def load_checkpoint(
self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"])
if eval:
self.eval()
self.store_inverse()
assert not self.training
@staticmethod
def get_criterion():
from TTS.tts.layers.losses import GlowTTSLoss # pylint: disable=import-outside-toplevel
return GlowTTSLoss()
def on_train_step_start(self, trainer):
"""Decide on every training step wheter enable/disable data depended initialization."""
self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps
@staticmethod
def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
"""Initiate model from config
Args:
config (VitsConfig): Model config.
samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
Defaults to None.
verbose (bool): If True, print init messages. Defaults to True.
"""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config, verbose)
tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config, samples)
return GlowTTS(new_config, ap, tokenizer, speaker_manager)
+385
View File
@@ -0,0 +1,385 @@
import os
from typing import Dict, List, Union
import torch
from coqpit import Coqpit
from torch import nn
from trainer.logging.tensorboard_logger import TensorboardLogger
from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils
from TTS.tts.layers.overflow.neural_hmm import NeuralHMM
from TTS.tts.layers.overflow.plotting_utils import (
get_spec_from_most_probable_state,
plot_transition_probabilities_to_numpy,
)
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.generic_utils import format_aux_input
from TTS.utils.io import load_fsspec
class NeuralhmmTTS(BaseTTS):
"""Neural HMM TTS model.
Paper::
https://arxiv.org/abs/2108.13320
Paper abstract::
Neural sequence-to-sequence TTS has achieved significantly better output quality
than statistical speech synthesis using HMMs.However, neural TTS is generally not probabilistic
and uses non-monotonic attention. Attention failures increase training time and can make
synthesis babble incoherently. This paper describes how the old and new paradigms can be
combined to obtain the advantages of both worlds, by replacing attention in neural TTS with
an autoregressive left-right no-skip hidden Markov model defined by a neural network.
Based on this proposal, we modify Tacotron 2 to obtain an HMM-based neural TTS model with
monotonic alignment, trained to maximise the full sequence likelihood without approximation.
We also describe how to combine ideas from classical and contemporary TTS for best results.
The resulting example system is smaller and simpler than Tacotron 2, and learns to speak with
fewer iterations and less data, whilst achieving comparable naturalness prior to the post-net.
Our approach also allows easy control over speaking rate. Audio examples and code
are available at https://shivammehta25.github.io/Neural-HMM/ .
Note:
- This is a parameter efficient version of OverFlow (15.3M vs 28.6M). Since it has half the
number of parameters as OverFlow the synthesis output quality is suboptimal (but comparable to Tacotron2
without Postnet), but it learns to speak with even lesser amount of data and is still significantly faster
than other attention-based methods.
- Neural HMMs uses flat start initialization i.e it computes the means and std and transition probabilities
of the dataset and uses them to initialize the model. This benefits the model and helps with faster learning
If you change the dataset or want to regenerate the parameters change the `force_generate_statistics` and
`mel_statistics_parameter_path` accordingly.
- To enable multi-GPU training, set the `use_grad_checkpointing=False` in config.
This will significantly increase the memory usage. This is because to compute
the actual data likelihood (not an approximation using MAS/Viterbi) we must use
all the states at the previous time step during the forward pass to decide the
probability distribution at the current step i.e the difference between the forward
algorithm and viterbi approximation.
Check :class:`TTS.tts.configs.neuralhmm_tts_config.NeuralhmmTTSConfig` for class arguments.
"""
def __init__(
self,
config: "NeuralhmmTTSConfig",
ap: "AudioProcessor" = None,
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
# pass all config fields to `self`
# for fewer code change
self.config = config
for key in config:
setattr(self, key, config[key])
self.encoder = Encoder(config.num_chars, config.state_per_phone, config.encoder_in_out_features)
self.neural_hmm = NeuralHMM(
frame_channels=self.out_channels,
ar_order=self.ar_order,
deterministic_transition=self.deterministic_transition,
encoder_dim=self.encoder_in_out_features,
prenet_type=self.prenet_type,
prenet_dim=self.prenet_dim,
prenet_n_layers=self.prenet_n_layers,
prenet_dropout=self.prenet_dropout,
prenet_dropout_at_inference=self.prenet_dropout_at_inference,
memory_rnn_dim=self.memory_rnn_dim,
outputnet_size=self.outputnet_size,
flat_start_params=self.flat_start_params,
std_floor=self.std_floor,
use_grad_checkpointing=self.use_grad_checkpointing,
)
self.register_buffer("mean", torch.tensor(0))
self.register_buffer("std", torch.tensor(1))
def update_mean_std(self, statistics_dict: Dict):
self.mean.data = torch.tensor(statistics_dict["mean"])
self.std.data = torch.tensor(statistics_dict["std"])
def preprocess_batch(self, text, text_len, mels, mel_len):
if self.mean.item() == 0 or self.std.item() == 1:
statistics_dict = torch.load(self.mel_statistics_parameter_path)
self.update_mean_std(statistics_dict)
mels = self.normalize(mels)
return text, text_len, mels, mel_len
def normalize(self, x):
return x.sub(self.mean).div(self.std)
def inverse_normalize(self, x):
return x.mul(self.std).add(self.mean)
def forward(self, text, text_len, mels, mel_len):
"""
Forward pass for training and computing the log likelihood of a given batch.
Shapes:
Shapes:
text: :math:`[B, T_in]`
text_len: :math:`[B]`
mels: :math:`[B, T_out, C]`
mel_len: :math:`[B]`
"""
text, text_len, mels, mel_len = self.preprocess_batch(text, text_len, mels, mel_len)
encoder_outputs, encoder_output_len = self.encoder(text, text_len)
log_probs, fwd_alignments, transition_vectors, means = self.neural_hmm(
encoder_outputs, encoder_output_len, mels.transpose(1, 2), mel_len
)
outputs = {
"log_probs": log_probs,
"alignments": fwd_alignments,
"transition_vectors": transition_vectors,
"means": means,
}
return outputs
@staticmethod
def _training_stats(batch):
stats = {}
stats["avg_text_length"] = batch["text_lengths"].float().mean()
stats["avg_spec_length"] = batch["mel_lengths"].float().mean()
stats["avg_text_batch_occupancy"] = (batch["text_lengths"].float() / batch["text_lengths"].float().max()).mean()
stats["avg_spec_batch_occupancy"] = (batch["mel_lengths"].float() / batch["mel_lengths"].float().max()).mean()
return stats
def train_step(self, batch: dict, criterion: nn.Module):
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
outputs = self.forward(
text=text_input,
text_len=text_lengths,
mels=mel_input,
mel_len=mel_lengths,
)
loss_dict = criterion(outputs["log_probs"] / (mel_lengths.sum() + text_lengths.sum()))
# for printing useful statistics on terminal
loss_dict.update(self._training_stats(batch))
return outputs, loss_dict
def eval_step(self, batch: Dict, criterion: nn.Module):
return self.train_step(batch, criterion)
def _format_aux_input(self, aux_input: Dict, default_input_dict):
"""Set missing fields to their default value.
Args:
aux_inputs (Dict): Dictionary containing the auxiliary inputs.
"""
default_input_dict = default_input_dict.copy()
default_input_dict.update(
{
"sampling_temp": self.sampling_temp,
"max_sampling_time": self.max_sampling_time,
"duration_threshold": self.duration_threshold,
}
)
if aux_input:
return format_aux_input(default_input_dict, aux_input)
return default_input_dict
@torch.no_grad()
def inference(
self,
text: torch.Tensor,
aux_input={"x_lengths": None, "sampling_temp": None, "max_sampling_time": None, "duration_threshold": None},
): # pylint: disable=dangerous-default-value
"""Sampling from the model
Args:
text (torch.Tensor): :math:`[B, T_in]`
aux_inputs (_type_, optional): _description_. Defaults to None.
Returns:
outputs: Dictionary containing the following
- mel (torch.Tensor): :math:`[B, T_out, C]`
- hmm_outputs_len (torch.Tensor): :math:`[B]`
- state_travelled (List[List[int]]): List of lists containing the state travelled for each sample in the batch.
- input_parameters (list[torch.FloatTensor]): Input parameters to the neural HMM.
- output_parameters (list[torch.FloatTensor]): Output parameters to the neural HMM.
"""
default_input_dict = {
"x_lengths": torch.sum(text != 0, dim=1),
}
aux_input = self._format_aux_input(aux_input, default_input_dict)
encoder_outputs, encoder_output_len = self.encoder.inference(text, aux_input["x_lengths"])
outputs = self.neural_hmm.inference(
encoder_outputs,
encoder_output_len,
sampling_temp=aux_input["sampling_temp"],
max_sampling_time=aux_input["max_sampling_time"],
duration_threshold=aux_input["duration_threshold"],
)
mels, mel_outputs_len = outputs["hmm_outputs"], outputs["hmm_outputs_len"]
mels = self.inverse_normalize(mels)
outputs.update({"model_outputs": mels, "model_outputs_len": mel_outputs_len})
outputs["alignments"] = OverflowUtils.double_pad(outputs["alignments"])
return outputs
@staticmethod
def get_criterion():
return NLLLoss()
@staticmethod
def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
"""Initiate model from config
Args:
config (VitsConfig): Model config.
samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
Defaults to None.
verbose (bool): If True, print init messages. Defaults to True.
"""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config, verbose)
tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config, samples)
return NeuralhmmTTS(new_config, ap, tokenizer, speaker_manager)
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"])
if eval:
self.eval()
assert not self.training
def on_init_start(self, trainer):
"""If the current dataset does not have normalisation statistics and initialisation transition_probability it computes them otherwise loads."""
if not os.path.isfile(trainer.config.mel_statistics_parameter_path) or trainer.config.force_generate_statistics:
dataloader = trainer.get_train_dataloader(
training_assets=None, samples=trainer.train_samples, verbose=False
)
print(
f" | > Data parameters not found for: {trainer.config.mel_statistics_parameter_path}. Computing mel normalization parameters..."
)
data_mean, data_std, init_transition_prob = OverflowUtils.get_data_parameters_for_flat_start(
dataloader, trainer.config.out_channels, trainer.config.state_per_phone
)
print(
f" | > Saving data parameters to: {trainer.config.mel_statistics_parameter_path}: value: {data_mean, data_std, init_transition_prob}"
)
statistics = {
"mean": data_mean.item(),
"std": data_std.item(),
"init_transition_prob": init_transition_prob.item(),
}
torch.save(statistics, trainer.config.mel_statistics_parameter_path)
else:
print(
f" | > Data parameters found for: {trainer.config.mel_statistics_parameter_path}. Loading mel normalization parameters..."
)
statistics = torch.load(trainer.config.mel_statistics_parameter_path)
data_mean, data_std, init_transition_prob = (
statistics["mean"],
statistics["std"],
statistics["init_transition_prob"],
)
print(f" | > Data parameters loaded with value: {data_mean, data_std, init_transition_prob}")
trainer.config.flat_start_params["transition_p"] = (
init_transition_prob.item() if torch.is_tensor(init_transition_prob) else init_transition_prob
)
OverflowUtils.update_flat_start_transition(trainer.model, init_transition_prob)
trainer.model.update_mean_std(statistics)
@torch.inference_mode()
def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unused-argument
alignments, transition_vectors = outputs["alignments"], outputs["transition_vectors"]
means = torch.stack(outputs["means"], dim=1)
figures = {
"alignment": plot_alignment(alignments[0].exp(), title="Forward alignment", fig_size=(20, 20)),
"log_alignment": plot_alignment(
alignments[0].exp(), title="Forward log alignment", plot_log=True, fig_size=(20, 20)
),
"transition_vectors": plot_alignment(transition_vectors[0], title="Transition vectors", fig_size=(20, 20)),
"mel_from_most_probable_state": plot_spectrogram(
get_spec_from_most_probable_state(alignments[0], means[0]), fig_size=(12, 3)
),
"mel_target": plot_spectrogram(batch["mel_input"][0], fig_size=(12, 3)),
}
# sample one item from the batch -1 will give the smalles item
print(" | > Synthesising audio from the model...")
inference_output = self.inference(
batch["text_input"][-1].unsqueeze(0), aux_input={"x_lengths": batch["text_lengths"][-1].unsqueeze(0)}
)
figures["synthesised"] = plot_spectrogram(inference_output["model_outputs"][0], fig_size=(12, 3))
states = [p[1] for p in inference_output["input_parameters"][0]]
transition_probability_synthesising = [p[2].cpu().numpy() for p in inference_output["output_parameters"][0]]
for i in range((len(transition_probability_synthesising) // 200) + 1):
start = i * 200
end = (i + 1) * 200
figures[f"synthesised_transition_probabilities/{i}"] = plot_transition_probabilities_to_numpy(
states[start:end], transition_probability_synthesising[start:end]
)
audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy())
return figures, {"audios": audio}
def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
): # pylint: disable=unused-argument
"""Log training progress."""
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.train_figures(steps, figures)
logger.train_audios(steps, audios, self.ap.sample_rate)
def eval_log(
self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int
): # pylint: disable=unused-argument
"""Compute and log evaluation metrics."""
# Plot model parameters histograms
if isinstance(logger, TensorboardLogger):
# I don't know if any other loggers supports this
for tag, value in self.named_parameters():
tag = tag.replace(".", "/")
logger.writer.add_histogram(tag, value.data.cpu().numpy(), steps)
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.eval_figures(steps, figures)
logger.eval_audios(steps, audios, self.ap.sample_rate)
def test_log(
self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument
) -> None:
logger.test_audios(steps, outputs[1], self.ap.sample_rate)
logger.test_figures(steps, outputs[0])
class NLLLoss(nn.Module):
"""Negative log likelihood loss."""
def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use
"""Compute the loss.
Args:
logits (Tensor): [B, T, D]
Returns:
Tensor: [1]
"""
return_dict = {}
return_dict["loss"] = -log_prob.mean()
return return_dict
+401
View File
@@ -0,0 +1,401 @@
import os
from typing import Dict, List, Union
import torch
from coqpit import Coqpit
from torch import nn
from trainer.logging.tensorboard_logger import TensorboardLogger
from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils
from TTS.tts.layers.overflow.decoder import Decoder
from TTS.tts.layers.overflow.neural_hmm import NeuralHMM
from TTS.tts.layers.overflow.plotting_utils import (
get_spec_from_most_probable_state,
plot_transition_probabilities_to_numpy,
)
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.generic_utils import format_aux_input
from TTS.utils.io import load_fsspec
class Overflow(BaseTTS):
"""OverFlow TTS model.
Paper::
https://arxiv.org/abs/2211.06892
Paper abstract::
Neural HMMs are a type of neural transducer recently proposed for
sequence-to-sequence modelling in text-to-speech. They combine the best features
of classic statistical speech synthesis and modern neural TTS, requiring less
data and fewer training updates, and are less prone to gibberish output caused
by neural attention failures. In this paper, we combine neural HMM TTS with
normalising flows for describing the highly non-Gaussian distribution of speech
acoustics. The result is a powerful, fully probabilistic model of durations and
acoustics that can be trained using exact maximum likelihood. Compared to
dominant flow-based acoustic models, our approach integrates autoregression for
improved modelling of long-range dependences such as utterance-level prosody.
Experiments show that a system based on our proposal gives more accurate
pronunciations and better subjective speech quality than comparable methods,
whilst retaining the original advantages of neural HMMs. Audio examples and code
are available at https://shivammehta25.github.io/OverFlow/.
Note:
- Neural HMMs uses flat start initialization i.e it computes the means and std and transition probabilities
of the dataset and uses them to initialize the model. This benefits the model and helps with faster learning
If you change the dataset or want to regenerate the parameters change the `force_generate_statistics` and
`mel_statistics_parameter_path` accordingly.
- To enable multi-GPU training, set the `use_grad_checkpointing=False` in config.
This will significantly increase the memory usage. This is because to compute
the actual data likelihood (not an approximation using MAS/Viterbi) we must use
all the states at the previous time step during the forward pass to decide the
probability distribution at the current step i.e the difference between the forward
algorithm and viterbi approximation.
Check :class:`TTS.tts.configs.overflow.OverFlowConfig` for class arguments.
"""
def __init__(
self,
config: "OverFlowConfig",
ap: "AudioProcessor" = None,
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
# pass all config fields to `self`
# for fewer code change
self.config = config
for key in config:
setattr(self, key, config[key])
self.decoder_output_dim = config.out_channels
self.encoder = Encoder(config.num_chars, config.state_per_phone, config.encoder_in_out_features)
self.neural_hmm = NeuralHMM(
frame_channels=self.out_channels,
ar_order=self.ar_order,
deterministic_transition=self.deterministic_transition,
encoder_dim=self.encoder_in_out_features,
prenet_type=self.prenet_type,
prenet_dim=self.prenet_dim,
prenet_n_layers=self.prenet_n_layers,
prenet_dropout=self.prenet_dropout,
prenet_dropout_at_inference=self.prenet_dropout_at_inference,
memory_rnn_dim=self.memory_rnn_dim,
outputnet_size=self.outputnet_size,
flat_start_params=self.flat_start_params,
std_floor=self.std_floor,
use_grad_checkpointing=self.use_grad_checkpointing,
)
self.decoder = Decoder(
self.out_channels,
self.hidden_channels_dec,
self.kernel_size_dec,
self.dilation_rate,
self.num_flow_blocks_dec,
self.num_block_layers,
dropout_p=self.dropout_p_dec,
num_splits=self.num_splits,
num_squeeze=self.num_squeeze,
sigmoid_scale=self.sigmoid_scale,
c_in_channels=self.c_in_channels,
)
self.register_buffer("mean", torch.tensor(0))
self.register_buffer("std", torch.tensor(1))
def update_mean_std(self, statistics_dict: Dict):
self.mean.data = torch.tensor(statistics_dict["mean"])
self.std.data = torch.tensor(statistics_dict["std"])
def preprocess_batch(self, text, text_len, mels, mel_len):
if self.mean.item() == 0 or self.std.item() == 1:
statistics_dict = torch.load(self.mel_statistics_parameter_path)
self.update_mean_std(statistics_dict)
mels = self.normalize(mels)
return text, text_len, mels, mel_len
def normalize(self, x):
return x.sub(self.mean).div(self.std)
def inverse_normalize(self, x):
return x.mul(self.std).add(self.mean)
def forward(self, text, text_len, mels, mel_len):
"""
Forward pass for training and computing the log likelihood of a given batch.
Shapes:
Shapes:
text: :math:`[B, T_in]`
text_len: :math:`[B]`
mels: :math:`[B, T_out, C]`
mel_len: :math:`[B]`
"""
text, text_len, mels, mel_len = self.preprocess_batch(text, text_len, mels, mel_len)
encoder_outputs, encoder_output_len = self.encoder(text, text_len)
z, z_lengths, logdet = self.decoder(mels.transpose(1, 2), mel_len)
log_probs, fwd_alignments, transition_vectors, means = self.neural_hmm(
encoder_outputs, encoder_output_len, z, z_lengths
)
outputs = {
"log_probs": log_probs + logdet,
"alignments": fwd_alignments,
"transition_vectors": transition_vectors,
"means": means,
}
return outputs
@staticmethod
def _training_stats(batch):
stats = {}
stats["avg_text_length"] = batch["text_lengths"].float().mean()
stats["avg_spec_length"] = batch["mel_lengths"].float().mean()
stats["avg_text_batch_occupancy"] = (batch["text_lengths"].float() / batch["text_lengths"].float().max()).mean()
stats["avg_spec_batch_occupancy"] = (batch["mel_lengths"].float() / batch["mel_lengths"].float().max()).mean()
return stats
def train_step(self, batch: dict, criterion: nn.Module):
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
outputs = self.forward(
text=text_input,
text_len=text_lengths,
mels=mel_input,
mel_len=mel_lengths,
)
loss_dict = criterion(outputs["log_probs"] / (mel_lengths.sum() + text_lengths.sum()))
# for printing useful statistics on terminal
loss_dict.update(self._training_stats(batch))
return outputs, loss_dict
def eval_step(self, batch: Dict, criterion: nn.Module):
return self.train_step(batch, criterion)
def _format_aux_input(self, aux_input: Dict, default_input_dict):
"""Set missing fields to their default value.
Args:
aux_inputs (Dict): Dictionary containing the auxiliary inputs.
"""
default_input_dict = default_input_dict.copy()
default_input_dict.update(
{
"sampling_temp": self.sampling_temp,
"max_sampling_time": self.max_sampling_time,
"duration_threshold": self.duration_threshold,
}
)
if aux_input:
return format_aux_input(default_input_dict, aux_input)
return default_input_dict
@torch.no_grad()
def inference(
self,
text: torch.Tensor,
aux_input={"x_lengths": None, "sampling_temp": None, "max_sampling_time": None, "duration_threshold": None},
): # pylint: disable=dangerous-default-value
"""Sampling from the model
Args:
text (torch.Tensor): :math:`[B, T_in]`
aux_inputs (_type_, optional): _description_. Defaults to None.
Returns:
outputs: Dictionary containing the following
- mel (torch.Tensor): :math:`[B, T_out, C]`
- hmm_outputs_len (torch.Tensor): :math:`[B]`
- state_travelled (List[List[int]]): List of lists containing the state travelled for each sample in the batch.
- input_parameters (list[torch.FloatTensor]): Input parameters to the neural HMM.
- output_parameters (list[torch.FloatTensor]): Output parameters to the neural HMM.
"""
default_input_dict = {
"x_lengths": torch.sum(text != 0, dim=1),
}
aux_input = self._format_aux_input(aux_input, default_input_dict)
encoder_outputs, encoder_output_len = self.encoder.inference(text, aux_input["x_lengths"])
outputs = self.neural_hmm.inference(
encoder_outputs,
encoder_output_len,
sampling_temp=aux_input["sampling_temp"],
max_sampling_time=aux_input["max_sampling_time"],
duration_threshold=aux_input["duration_threshold"],
)
mels, mel_outputs_len, _ = self.decoder(
outputs["hmm_outputs"].transpose(1, 2), outputs["hmm_outputs_len"], reverse=True
)
mels = self.inverse_normalize(mels.transpose(1, 2))
outputs.update({"model_outputs": mels, "model_outputs_len": mel_outputs_len})
outputs["alignments"] = OverflowUtils.double_pad(outputs["alignments"])
return outputs
@staticmethod
def get_criterion():
return NLLLoss()
@staticmethod
def init_from_config(config: "OverFlowConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
"""Initiate model from config
Args:
config (VitsConfig): Model config.
samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
Defaults to None.
verbose (bool): If True, print init messages. Defaults to True.
"""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config, verbose)
tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config, samples)
return Overflow(new_config, ap, tokenizer, speaker_manager)
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"])
if eval:
self.eval()
self.decoder.store_inverse()
assert not self.training
def on_init_start(self, trainer):
"""If the current dataset does not have normalisation statistics and initialisation transition_probability it computes them otherwise loads."""
if not os.path.isfile(trainer.config.mel_statistics_parameter_path) or trainer.config.force_generate_statistics:
dataloader = trainer.get_train_dataloader(
training_assets=None, samples=trainer.train_samples, verbose=False
)
print(
f" | > Data parameters not found for: {trainer.config.mel_statistics_parameter_path}. Computing mel normalization parameters..."
)
data_mean, data_std, init_transition_prob = OverflowUtils.get_data_parameters_for_flat_start(
dataloader, trainer.config.out_channels, trainer.config.state_per_phone
)
print(
f" | > Saving data parameters to: {trainer.config.mel_statistics_parameter_path}: value: {data_mean, data_std, init_transition_prob}"
)
statistics = {
"mean": data_mean.item(),
"std": data_std.item(),
"init_transition_prob": init_transition_prob.item(),
}
torch.save(statistics, trainer.config.mel_statistics_parameter_path)
else:
print(
f" | > Data parameters found for: {trainer.config.mel_statistics_parameter_path}. Loading mel normalization parameters..."
)
statistics = torch.load(trainer.config.mel_statistics_parameter_path)
data_mean, data_std, init_transition_prob = (
statistics["mean"],
statistics["std"],
statistics["init_transition_prob"],
)
print(f" | > Data parameters loaded with value: {data_mean, data_std, init_transition_prob}")
trainer.config.flat_start_params["transition_p"] = (
init_transition_prob.item() if torch.is_tensor(init_transition_prob) else init_transition_prob
)
OverflowUtils.update_flat_start_transition(trainer.model, init_transition_prob)
trainer.model.update_mean_std(statistics)
@torch.inference_mode()
def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unused-argument
alignments, transition_vectors = outputs["alignments"], outputs["transition_vectors"]
means = torch.stack(outputs["means"], dim=1)
figures = {
"alignment": plot_alignment(alignments[0].exp(), title="Forward alignment", fig_size=(20, 20)),
"log_alignment": plot_alignment(
alignments[0].exp(), title="Forward log alignment", plot_log=True, fig_size=(20, 20)
),
"transition_vectors": plot_alignment(transition_vectors[0], title="Transition vectors", fig_size=(20, 20)),
"mel_from_most_probable_state": plot_spectrogram(
get_spec_from_most_probable_state(alignments[0], means[0], self.decoder), fig_size=(12, 3)
),
"mel_target": plot_spectrogram(batch["mel_input"][0], fig_size=(12, 3)),
}
# sample one item from the batch -1 will give the smalles item
print(" | > Synthesising audio from the model...")
inference_output = self.inference(
batch["text_input"][-1].unsqueeze(0), aux_input={"x_lengths": batch["text_lengths"][-1].unsqueeze(0)}
)
figures["synthesised"] = plot_spectrogram(inference_output["model_outputs"][0], fig_size=(12, 3))
states = [p[1] for p in inference_output["input_parameters"][0]]
transition_probability_synthesising = [p[2].cpu().numpy() for p in inference_output["output_parameters"][0]]
for i in range((len(transition_probability_synthesising) // 200) + 1):
start = i * 200
end = (i + 1) * 200
figures[f"synthesised_transition_probabilities/{i}"] = plot_transition_probabilities_to_numpy(
states[start:end], transition_probability_synthesising[start:end]
)
audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy())
return figures, {"audios": audio}
def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
): # pylint: disable=unused-argument
"""Log training progress."""
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.train_figures(steps, figures)
logger.train_audios(steps, audios, self.ap.sample_rate)
def eval_log(
self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int
): # pylint: disable=unused-argument
"""Compute and log evaluation metrics."""
# Plot model parameters histograms
if isinstance(logger, TensorboardLogger):
# I don't know if any other loggers supports this
for tag, value in self.named_parameters():
tag = tag.replace(".", "/")
logger.writer.add_histogram(tag, value.data.cpu().numpy(), steps)
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.eval_figures(steps, figures)
logger.eval_audios(steps, audios, self.ap.sample_rate)
def test_log(
self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument
) -> None:
logger.test_audios(steps, outputs[1], self.ap.sample_rate)
logger.test_figures(steps, outputs[0])
class NLLLoss(nn.Module):
"""Negative log likelihood loss."""
def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use
"""Compute the loss.
Args:
logits (Tensor): [B, T, D]
Returns:
Tensor: [1]
"""
return_dict = {}
return_dict["loss"] = -log_prob.mean()
return return_dict
+409
View File
@@ -0,0 +1,409 @@
# coding: utf-8
from typing import Dict, List, Tuple, Union
import torch
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from trainer.trainer_utils import get_optimizer, get_scheduler
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
from TTS.tts.models.base_tacotron import BaseTacotron
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
class Tacotron(BaseTacotron):
"""Tacotron as in https://arxiv.org/abs/1703.10135
It's an autoregressive encoder-attention-decoder-postnet architecture.
Check `TacotronConfig` for the arguments.
Args:
config (TacotronConfig): Configuration for the Tacotron model.
speaker_manager (SpeakerManager): Speaker manager to handle multi-speaker settings. Only use if the model is
a multi-speaker model. Defaults to None.
"""
def __init__(
self,
config: "TacotronConfig",
ap: "AudioProcessor" = None,
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
# pass all config fields to `self`
# for fewer code change
for key in config:
setattr(self, key, config[key])
# set speaker embedding channel size for determining `in_channels` for the connected layers.
# `init_multispeaker` needs to be called once more in training to initialize the speaker embedding layer based
# on the number of speakers infered from the dataset.
if self.use_speaker_embedding or self.use_d_vector_file:
self.init_multispeaker(config)
self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim
if self.use_gst:
self.decoder_in_features += self.gst.gst_embedding_dim
if self.use_capacitron_vae:
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
# embedding layer
self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3)
# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(
self.decoder_in_features,
self.decoder_output_dim,
self.r,
self.memory_size,
self.attention_type,
self.windowing,
self.attention_norm,
self.prenet_type,
self.prenet_dropout,
self.use_forward_attn,
self.transition_agent,
self.forward_attn_mask,
self.location_attn,
self.attention_heads,
self.separate_stopnet,
self.max_decoder_steps,
)
self.postnet = PostCBHG(self.decoder_output_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, self.out_channels)
# setup prenet dropout
self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference
# global style token layers
if self.gst and self.use_gst:
self.gst_layer = GST(
num_mel=self.decoder_output_dim,
num_heads=self.gst.gst_num_heads,
num_style_tokens=self.gst.gst_num_style_tokens,
gst_embedding_dim=self.gst.gst_embedding_dim,
)
# Capacitron layers
if self.capacitron_vae and self.use_capacitron_vae:
self.capacitron_vae_layer = CapacitronVAE(
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
speaker_embedding_dim=self.embedded_speaker_dim
if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
else None,
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
)
# backward pass decoder
if self.bidirectional_decoder:
self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
self.decoder_in_features,
self.decoder_output_dim,
self.ddc_r,
self.memory_size,
self.attention_type,
self.windowing,
self.attention_norm,
self.prenet_type,
self.prenet_dropout,
self.use_forward_attn,
self.transition_agent,
self.forward_attn_mask,
self.location_attn,
self.attention_heads,
self.separate_stopnet,
self.max_decoder_steps,
)
def forward( # pylint: disable=dangerous-default-value
self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None}
):
"""
Shapes:
text: [B, T_in]
text_lengths: [B]
mel_specs: [B, T_out, C]
mel_lengths: [B]
aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C]
"""
aux_input = self._format_aux_input(aux_input)
outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
inputs = self.embedding(text)
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x T_in x encoder_in_features
encoder_outputs = self.encoder(inputs)
# sequence masking
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# global style token
if self.gst and self.use_gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
# speaker embedding
if self.use_speaker_embedding or self.use_d_vector_file:
if not self.use_d_vector_file:
# B x 1 x speaker_embed_dim
embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None]
else:
# B x 1 x speaker_embed_dim
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
# Capacitron
if self.capacitron_vae and self.use_capacitron_vae:
# B x capacitron_VAE_embedding_dim
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
text_info=[inputs, text_lengths]
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
capacitron_vae_outputs = None
# decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in
decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask)
# sequence masking
if output_mask is not None:
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
# B x T_out x decoder_in_features
postnet_outputs = self.postnet(decoder_outputs)
# sequence masking
if output_mask is not None:
postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs)
# B x T_out x posnet_dim
postnet_outputs = self.last_linear(postnet_outputs)
# B x T_out x decoder_in_features
decoder_outputs = decoder_outputs.transpose(1, 2).contiguous()
if self.bidirectional_decoder:
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
outputs["alignments_backward"] = alignments_backward
outputs["decoder_outputs_backward"] = decoder_outputs_backward
if self.double_decoder_consistency:
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(
mel_specs, encoder_outputs, alignments, input_mask
)
outputs["alignments_backward"] = alignments_backward
outputs["decoder_outputs_backward"] = decoder_outputs_backward
outputs.update(
{
"model_outputs": postnet_outputs,
"decoder_outputs": decoder_outputs,
"alignments": alignments,
"stop_tokens": stop_tokens,
"capacitron_vae_outputs": capacitron_vae_outputs,
}
)
return outputs
@torch.no_grad()
def inference(self, text_input, aux_input=None):
aux_input = self._format_aux_input(aux_input)
inputs = self.embedding(text_input)
encoder_outputs = self.encoder(inputs)
if self.gst and self.use_gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
if self.capacitron_vae and self.use_capacitron_vae:
if aux_input["style_text"] is not None:
style_text_embedding = self.embedding(aux_input["style_text"])
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
encoder_outputs.device
) # pylint: disable=not-callable
reference_mel_length = (
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
if aux_input["style_mel"] is not None
else None
) # pylint: disable=not-callable
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
if aux_input["style_mel"] is not None
else None,
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
speaker_embedding=aux_input["d_vectors"]
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
)
if self.num_speakers > 1:
if not self.use_d_vector_file:
# B x 1 x speaker_embed_dim
embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])
# reshape embedded_speakers
if embedded_speakers.ndim == 1:
embedded_speakers = embedded_speakers[None, None, :]
elif embedded_speakers.ndim == 2:
embedded_speakers = embedded_speakers[None, :]
else:
# B x 1 x speaker_embed_dim
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = self.last_linear(postnet_outputs)
decoder_outputs = decoder_outputs.transpose(1, 2)
outputs = {
"model_outputs": postnet_outputs,
"decoder_outputs": decoder_outputs,
"alignments": alignments,
"stop_tokens": stop_tokens,
}
return outputs
def before_backward_pass(self, loss_dict, optimizer) -> None:
# Extracting custom training specific operations for capacitron
# from the trainer
if self.use_capacitron_vae:
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]:
"""Perform a single training step by fetching the right set of samples from the batch.
Args:
batch ([Dict]): A dictionary of input tensors.
criterion ([torch.nn.Module]): Callable criterion to compute model loss.
"""
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
linear_input = batch["linear_input"]
stop_targets = batch["stop_targets"]
stop_target_lengths = batch["stop_target_lengths"]
speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"]
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % self.decoder.r != 0:
alignment_lengths = (
mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r))
) // self.decoder.r
else:
alignment_lengths = mel_lengths // self.decoder.r
# compute loss
with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion(
outputs["model_outputs"].float(),
outputs["decoder_outputs"].float(),
mel_input.float(),
linear_input.float(),
outputs["stop_tokens"].float(),
stop_targets.float(),
stop_target_lengths,
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
mel_lengths,
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"].float(),
alignment_lengths,
None if outputs["alignments_backward"] is None else outputs["alignments_backward"].float(),
text_lengths,
)
# compute alignment error (the lower the better )
align_error = 1 - alignment_diagonal_score(outputs["alignments"])
loss_dict["align_error"] = align_error
return outputs, loss_dict
def get_optimizer(self) -> List:
if self.use_capacitron_vae:
return CapacitronOptimizer(self.config, self.named_parameters())
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
def get_scheduler(self, optimizer: object):
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
def before_gradient_clipping(self):
if self.use_capacitron_vae:
# Capacitron model specific gradient clipping
model_params_to_clip = []
for name, param in self.named_parameters():
if param.requires_grad:
if name != "capacitron_vae_layer.beta":
model_params_to_clip.append(param)
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
def _create_logs(self, batch, outputs, ap):
postnet_outputs = outputs["model_outputs"]
decoder_outputs = outputs["decoder_outputs"]
alignments = outputs["alignments"]
alignments_backward = outputs["alignments_backward"]
mel_input = batch["mel_input"]
linear_input = batch["linear_input"]
pred_linear_spec = postnet_outputs[0].data.cpu().numpy()
pred_mel_spec = decoder_outputs[0].data.cpu().numpy()
gt_linear_spec = linear_input[0].data.cpu().numpy()
gt_mel_spec = mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
"pred_linear_spec": plot_spectrogram(pred_linear_spec, ap, output_fig=False),
"real_linear_spec": plot_spectrogram(gt_linear_spec, ap, output_fig=False),
"pred_mel_spec": plot_spectrogram(pred_mel_spec, ap, output_fig=False),
"real_mel_spec": plot_spectrogram(gt_mel_spec, ap, output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False),
}
if self.bidirectional_decoder or self.double_decoder_consistency:
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
# Sample audio
audio = ap.inv_spectrogram(pred_linear_spec.T)
return figures, {"audio": audio}
def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
) -> None: # pylint: disable=no-self-use
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.train_figures(steps, figures)
logger.train_audios(steps, audios, self.ap.sample_rate)
def eval_step(self, batch: dict, criterion: nn.Module):
return self.train_step(batch, criterion)
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.eval_figures(steps, figures)
logger.eval_audios(steps, audios, self.ap.sample_rate)
@staticmethod
def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None):
"""Initiate model from config
Args:
config (TacotronConfig): Model config.
samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
Defaults to None.
"""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config)
tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config, samples)
return Tacotron(new_config, ap, tokenizer, speaker_manager)
+433
View File
@@ -0,0 +1,433 @@
# coding: utf-8
from typing import Dict, List, Union
import torch
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from trainer.trainer_utils import get_optimizer, get_scheduler
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
from TTS.tts.models.base_tacotron import BaseTacotron
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
class Tacotron2(BaseTacotron):
"""Tacotron2 model implementation inherited from :class:`TTS.tts.models.base_tacotron.BaseTacotron`.
Paper::
https://arxiv.org/abs/1712.05884
Paper abstract::
This paper describes Tacotron 2, a neural network architecture for speech synthesis directly from text.
The system is composed of a recurrent sequence-to-sequence feature prediction network that maps character
embeddings to mel-scale spectrograms, followed by a modified WaveNet model acting as a vocoder to synthesize
timedomain waveforms from those spectrograms. Our model achieves a mean opinion score (MOS) of 4.53 comparable
to a MOS of 4.58 for professionally recorded speech. To validate our design choices, we present ablation
studies of key components of our system and evaluate the impact of using mel spectrograms as the input to
WaveNet instead of linguistic, duration, and F0 features. We further demonstrate that using a compact acoustic
intermediate representation enables significant simplification of the WaveNet architecture.
Check :class:`TTS.tts.configs.tacotron2_config.Tacotron2Config` for model arguments.
Args:
config (TacotronConfig):
Configuration for the Tacotron2 model.
speaker_manager (SpeakerManager):
Speaker manager for multi-speaker training. Uuse only for multi-speaker training. Defaults to None.
"""
def __init__(
self,
config: "Tacotron2Config",
ap: "AudioProcessor" = None,
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):
super().__init__(config, ap, tokenizer, speaker_manager)
self.decoder_output_dim = config.out_channels
# pass all config fields to `self`
# for fewer code change
for key in config:
setattr(self, key, config[key])
# init multi-speaker layers
if self.use_speaker_embedding or self.use_d_vector_file:
self.init_multispeaker(config)
self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim
if self.use_gst:
self.decoder_in_features += self.gst.gst_embedding_dim
if self.use_capacitron_vae:
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
# embedding layer
self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0)
# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(
self.decoder_in_features,
self.decoder_output_dim,
self.r,
self.attention_type,
self.attention_win,
self.attention_norm,
self.prenet_type,
self.prenet_dropout,
self.use_forward_attn,
self.transition_agent,
self.forward_attn_mask,
self.location_attn,
self.attention_heads,
self.separate_stopnet,
self.max_decoder_steps,
)
self.postnet = Postnet(self.out_channels)
# setup prenet dropout
self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference
# global style token layers
if self.gst and self.use_gst:
self.gst_layer = GST(
num_mel=self.decoder_output_dim,
num_heads=self.gst.gst_num_heads,
num_style_tokens=self.gst.gst_num_style_tokens,
gst_embedding_dim=self.gst.gst_embedding_dim,
)
# Capacitron VAE Layers
if self.capacitron_vae and self.use_capacitron_vae:
self.capacitron_vae_layer = CapacitronVAE(
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
speaker_embedding_dim=self.embedded_speaker_dim
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
)
# backward pass decoder
if self.bidirectional_decoder:
self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
self.coarse_decoder = Decoder(
self.decoder_in_features,
self.decoder_output_dim,
self.ddc_r,
self.attention_type,
self.attention_win,
self.attention_norm,
self.prenet_type,
self.prenet_dropout,
self.use_forward_attn,
self.transition_agent,
self.forward_attn_mask,
self.location_attn,
self.attention_heads,
self.separate_stopnet,
self.max_decoder_steps,
)
@staticmethod
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
"""Final reshape of the model output tensors."""
mel_outputs = mel_outputs.transpose(1, 2)
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
return mel_outputs, mel_outputs_postnet, alignments
def forward( # pylint: disable=dangerous-default-value
self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None}
):
"""Forward pass for training with Teacher Forcing.
Shapes:
text: :math:`[B, T_in]`
text_lengths: :math:`[B]`
mel_specs: :math:`[B, T_out, C]`
mel_lengths: :math:`[B]`
aux_input: 'speaker_ids': :math:`[B, 1]` and 'd_vectors': :math:`[B, C]`
"""
aux_input = self._format_aux_input(aux_input)
outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
# compute mask for padding
# B x T_in_max (boolean)
input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
# B x D_embed x T_in_max
embedded_inputs = self.embedding(text).transpose(1, 2)
# B x T_in_max x D_en
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
if self.gst and self.use_gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
if self.use_speaker_embedding or self.use_d_vector_file:
if not self.use_d_vector_file:
# B x 1 x speaker_embed_dim
embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None]
else:
# B x 1 x speaker_embed_dim
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
# capacitron
if self.capacitron_vae and self.use_capacitron_vae:
# B x capacitron_VAE_embedding_dim
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
text_info=[embedded_inputs.transpose(1, 2), text_lengths]
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
capacitron_vae_outputs = None
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask)
# sequence masking
if mel_lengths is not None:
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
# B x mel_dim x T_out
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = decoder_outputs + postnet_outputs
# sequence masking
if output_mask is not None:
postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs)
# B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments)
if self.bidirectional_decoder:
decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
outputs["alignments_backward"] = alignments_backward
outputs["decoder_outputs_backward"] = decoder_outputs_backward
if self.double_decoder_consistency:
decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(
mel_specs, encoder_outputs, alignments, input_mask
)
outputs["alignments_backward"] = alignments_backward
outputs["decoder_outputs_backward"] = decoder_outputs_backward
outputs.update(
{
"model_outputs": postnet_outputs,
"decoder_outputs": decoder_outputs,
"alignments": alignments,
"stop_tokens": stop_tokens,
"capacitron_vae_outputs": capacitron_vae_outputs,
}
)
return outputs
@torch.no_grad()
def inference(self, text, aux_input=None):
"""Forward pass for inference with no Teacher-Forcing.
Shapes:
text: :math:`[B, T_in]`
text_lengths: :math:`[B]`
"""
aux_input = self._format_aux_input(aux_input)
embedded_inputs = self.embedding(text).transpose(1, 2)
encoder_outputs = self.encoder.inference(embedded_inputs)
if self.gst and self.use_gst:
# B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
if self.capacitron_vae and self.use_capacitron_vae:
if aux_input["style_text"] is not None:
style_text_embedding = self.embedding(aux_input["style_text"])
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
encoder_outputs.device
) # pylint: disable=not-callable
reference_mel_length = (
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
if aux_input["style_mel"] is not None
else None
) # pylint: disable=not-callable
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
if aux_input["style_mel"] is not None
else None,
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
speaker_embedding=aux_input["d_vectors"]
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
)
if self.num_speakers > 1:
if not self.use_d_vector_file:
embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None]
# reshape embedded_speakers
if embedded_speakers.ndim == 1:
embedded_speakers = embedded_speakers[None, None, :]
elif embedded_speakers.ndim == 2:
embedded_speakers = embedded_speakers[None, :]
else:
embedded_speakers = aux_input["d_vectors"]
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = decoder_outputs + postnet_outputs
decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments)
outputs = {
"model_outputs": postnet_outputs,
"decoder_outputs": decoder_outputs,
"alignments": alignments,
"stop_tokens": stop_tokens,
}
return outputs
def before_backward_pass(self, loss_dict, optimizer) -> None:
# Extracting custom training specific operations for capacitron
# from the trainer
if self.use_capacitron_vae:
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
def train_step(self, batch: Dict, criterion: torch.nn.Module):
"""A single training step. Forward pass and loss computation.
Args:
batch ([Dict]): A dictionary of input tensors.
criterion ([type]): Callable criterion to compute model loss.
"""
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
stop_targets = batch["stop_targets"]
stop_target_lengths = batch["stop_target_lengths"]
speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"]
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % self.decoder.r != 0:
alignment_lengths = (
mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r))
) // self.decoder.r
else:
alignment_lengths = mel_lengths // self.decoder.r
# compute loss
with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion(
outputs["model_outputs"].float(),
outputs["decoder_outputs"].float(),
mel_input.float(),
None,
outputs["stop_tokens"].float(),
stop_targets.float(),
stop_target_lengths,
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
mel_lengths,
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"].float(),
alignment_lengths,
None if outputs["alignments_backward"] is None else outputs["alignments_backward"].float(),
text_lengths,
)
# compute alignment error (the lower the better )
align_error = 1 - alignment_diagonal_score(outputs["alignments"])
loss_dict["align_error"] = align_error
return outputs, loss_dict
def get_optimizer(self) -> List:
if self.use_capacitron_vae:
return CapacitronOptimizer(self.config, self.named_parameters())
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
def get_scheduler(self, optimizer: object):
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
def before_gradient_clipping(self):
if self.use_capacitron_vae:
# Capacitron model specific gradient clipping
model_params_to_clip = []
for name, param in self.named_parameters():
if param.requires_grad:
if name != "capacitron_vae_layer.beta":
model_params_to_clip.append(param)
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
def _create_logs(self, batch, outputs, ap):
"""Create dashboard log information."""
postnet_outputs = outputs["model_outputs"]
alignments = outputs["alignments"]
alignments_backward = outputs["alignments_backward"]
mel_input = batch["mel_input"]
pred_spec = postnet_outputs[0].data.cpu().numpy()
gt_spec = mel_input[0].data.cpu().numpy()
align_img = alignments[0].data.cpu().numpy()
figures = {
"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
"alignment": plot_alignment(align_img, output_fig=False),
}
if self.bidirectional_decoder or self.double_decoder_consistency:
figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)
# Sample audio
audio = ap.inv_melspectrogram(pred_spec.T)
return figures, {"audio": audio}
def train_log(
self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
) -> None: # pylint: disable=no-self-use
"""Log training progress."""
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.train_figures(steps, figures)
logger.train_audios(steps, audios, self.ap.sample_rate)
def eval_step(self, batch: dict, criterion: nn.Module):
return self.train_step(batch, criterion)
def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
figures, audios = self._create_logs(batch, outputs, self.ap)
logger.eval_figures(steps, figures)
logger.eval_audios(steps, audios, self.ap.sample_rate)
@staticmethod
def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None):
"""Initiate model from config
Args:
config (Tacotron2Config): Model config.
samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
Defaults to None.
"""
from TTS.utils.audio import AudioProcessor
ap = AudioProcessor.init_from_config(config)
tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(new_config, samples)
return Tacotron2(new_config, ap, tokenizer, speaker_manager)
+911
View File
@@ -0,0 +1,911 @@
import os
import random
from contextlib import contextmanager
from dataclasses import dataclass
from time import time
import torch
import torch.nn.functional as F
import torchaudio
from coqpit import Coqpit
from tqdm import tqdm
from TTS.tts.layers.tortoise.arch_utils import TorchMelSpectrogram
from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, load_voice, wav_to_univnet_mel
from TTS.tts.layers.tortoise.autoregressive import UnifiedVoice
from TTS.tts.layers.tortoise.classifier import AudioMiniEncoderWithClassifierHead
from TTS.tts.layers.tortoise.clvp import CLVP
from TTS.tts.layers.tortoise.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps
from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts
from TTS.tts.layers.tortoise.random_latent_generator import RandomLatentConverter
from TTS.tts.layers.tortoise.tokenizer import VoiceBpeTokenizer
from TTS.tts.layers.tortoise.vocoder import VocConf, VocType
from TTS.tts.layers.tortoise.wav2vec_alignment import Wav2VecAlignment
from TTS.tts.models.base_tts import BaseTTS
def pad_or_truncate(t, length):
"""
Utility function for forcing <t> to have the specified sequence length, whether by clipping it or padding it with 0s.
"""
tp = t[..., :length]
if t.shape[-1] == length:
tp = t
elif t.shape[-1] < length:
tp = F.pad(t, (0, length - t.shape[-1]))
return tp
def deterministic_state(seed=None):
"""
Sets the random seeds that tortoise uses to the current time() and returns that seed so results can be
reproduced.
"""
seed = int(time()) if seed is None else seed
torch.manual_seed(seed)
random.seed(seed)
# Can't currently set this because of CUBLAS. TODO: potentially enable it if necessary.
# torch.use_deterministic_algorithms(True)
return seed
def load_discrete_vocoder_diffuser(
trained_diffusion_steps=4000,
desired_diffusion_steps=200,
cond_free=True,
cond_free_k=1,
sampler="ddim",
):
"""
Helper function to load a GaussianDiffusion instance configured for use as a vocoder.
"""
return SpacedDiffusion(
use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]),
model_mean_type="epsilon",
model_var_type="learned_range",
loss_type="mse",
betas=get_named_beta_schedule("linear", trained_diffusion_steps),
conditioning_free=cond_free,
conditioning_free_k=cond_free_k,
sampler=sampler,
)
def format_conditioning(clip, cond_length=132300, device="cuda", **kwargs):
"""
Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
"""
gap = clip.shape[-1] - cond_length
if gap < 0:
clip = F.pad(clip, pad=(0, abs(gap)))
elif gap > 0:
rand_start = random.randint(0, gap)
clip = clip[:, rand_start : rand_start + cond_length]
mel_clip = TorchMelSpectrogram(**kwargs)(clip.unsqueeze(0)).squeeze(0)
return mel_clip.unsqueeze(0).to(device)
def fix_autoregressive_output(codes, stop_token, complain=True):
"""
This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
trained on and what the autoregressive code generator creates (which has no padding or end).
This is highly specific to the DVAE being used, so this particular coding will not necessarily work if used with
a different DVAE. This can be inferred by feeding a audio clip padded with lots of zeros on the end through the DVAE
and copying out the last few codes.
Failing to do this padding will produce speech with a harsh end that sounds like "BLAH" or similar.
"""
# Strip off the autoregressive stop token and add padding.
stop_token_indices = (codes == stop_token).nonzero()
if len(stop_token_indices) == 0:
if complain:
print(
"No stop tokens found in one of the generated voice clips. This typically means the spoken audio is "
"too long. In some cases, the output will still be good, though. Listen to it and if it is missing words, "
"try breaking up your input text."
)
return codes
codes[stop_token_indices] = 83
stm = stop_token_indices.min().item()
codes[stm:] = 83
if stm - 3 < codes.shape[0]:
codes[-3] = 45
codes[-2] = 45
codes[-1] = 248
return codes
def do_spectrogram_diffusion(
diffusion_model,
diffuser,
latents,
conditioning_latents,
temperature=1,
verbose=True,
):
"""
Uses the specified diffusion model to convert discrete codes into a spectrogram.
"""
with torch.no_grad():
output_seq_len = (
latents.shape[1] * 4 * 24000 // 22050
) # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
output_shape = (latents.shape[0], 100, output_seq_len)
precomputed_embeddings = diffusion_model.timestep_independent(
latents, conditioning_latents, output_seq_len, False
)
noise = torch.randn(output_shape, device=latents.device) * temperature
mel = diffuser.sample_loop(
diffusion_model,
output_shape,
noise=noise,
model_kwargs={"precomputed_aligned_embeddings": precomputed_embeddings},
progress=verbose,
)
return denormalize_tacotron_mel(mel)[:, :, :output_seq_len]
def classify_audio_clip(clip, model_dir):
"""
Returns whether or not Tortoises' classifier thinks the given clip came from Tortoise.
:param clip: torch tensor containing audio waveform data (get it from load_audio)
:return: True if the clip was classified as coming from Tortoise and false if it was classified as real.
"""
classifier = AudioMiniEncoderWithClassifierHead(
2,
spec_dim=1,
embedding_dim=512,
depth=5,
downsample_factor=4,
resnet_blocks=2,
attn_blocks=4,
num_attn_heads=4,
base_channels=32,
dropout=0,
kernel_size=5,
distribute_zero_label=False,
)
classifier.load_state_dict(torch.load(os.path.join(model_dir, "classifier.pth"), map_location=torch.device("cpu")))
clip = clip.cpu().unsqueeze(0)
results = F.softmax(classifier(clip), dim=-1)
return results[0][0]
def pick_best_batch_size_for_gpu():
"""
Tries to pick a batch size that will fit in your GPU. These sizes aren't guaranteed to work, but they should give
you a good shot.
"""
if torch.cuda.is_available():
_, available = torch.cuda.mem_get_info()
availableGb = available / (1024**3)
batch_size = 1
if availableGb > 14:
batch_size = 16
elif availableGb > 10:
batch_size = 8
elif availableGb > 7:
batch_size = 4
return batch_size
@dataclass
class TortoiseAudioConfig(Coqpit):
sample_rate: int = 22050
diffusion_sample_rate: int = 24000
output_sample_rate: int = 24000
@dataclass
class TortoiseArgs(Coqpit):
"""A dataclass to represent Tortoise model arguments that define the model structure.
Args:
autoregressive_batch_size (int): The size of the auto-regressive batch.
enable_redaction (bool, optional): Whether to enable redaction. Defaults to True.
high_vram (bool, optional): Whether to use high VRAM. Defaults to False.
kv_cache (bool, optional): Whether to use the kv_cache. Defaults to True.
ar_checkpoint (str, optional): The checkpoint for the autoregressive model. Defaults to None.
clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None.
diff_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None.
num_chars (int, optional): The maximum number of characters to generate. Defaults to 255.
vocoder (VocType, optional): The vocoder to use for synthesis. Defaults to VocConf.Univnet.
For UnifiedVoice model:
ar_max_mel_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604.
ar_max_text_tokens (int, optional): The maximum text tokens for the autoregressive model. Defaults to 402.
ar_max_conditioning_inputs (int, optional): The maximum conditioning inputs for the autoregressive model. Defaults to 2.
ar_layers (int, optional): The number of layers for the autoregressive model. Defaults to 30.
ar_model_dim (int, optional): The model dimension for the autoregressive model. Defaults to 1024.
ar_heads (int, optional): The number of heads for the autoregressive model. Defaults to 16.
ar_number_text_tokens (int, optional): The number of text tokens for the autoregressive model. Defaults to 255.
ar_start_text_token (int, optional): The start text token for the autoregressive model. Defaults to 255.
ar_checkpointing (bool, optional): Whether to use checkpointing for the autoregressive model. Defaults to False.
ar_train_solo_embeddings (bool, optional): Whether to train embeddings for the autoregressive model. Defaults to False.
For DiffTTS model:
diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024.
diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10.
diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100.
diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200.
diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024.
diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193.
diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0.
diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False.
diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16.
diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0.
diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0.
For ConditionalLatentVariablePerseq model:
clvp_dim_text (int): The dimension of the text input for the CLVP module. Defaults to 768.
clvp_dim_speech (int): The dimension of the speech input for the CLVP module. Defaults to 768.
clvp_dim_latent (int): The dimension of the latent representation for the CLVP module. Defaults to 768.
clvp_num_text_tokens (int): The number of text tokens used by the CLVP module. Defaults to 256.
clvp_text_enc_depth (int): The depth of the text encoder in the CLVP module. Defaults to 20.
clvp_text_seq_len (int): The maximum sequence length of the text input for the CLVP module. Defaults to 350.
clvp_text_heads (int): The number of attention heads used by the text encoder in the CLVP module. Defaults to 12.
clvp_num_speech_tokens (int): The number of speech tokens used by the CLVP module. Defaults to 8192.
clvp_speech_enc_depth (int): The depth of the speech encoder in the CLVP module. Defaults to 20.
clvp_speech_heads (int): The number of attention heads used by the speech encoder in the CLVP module. Defaults to 12.
clvp_speech_seq_len (int): The maximum sequence length of the speech input for the CLVP module. Defaults to 430.
clvp_use_xformers (bool): A flag indicating whether the model uses transformers in the CLVP module. Defaults to True.
duration_const (int): A constant value used in the model. Defaults to 102400.
"""
autoregressive_batch_size: int = 1
enable_redaction: bool = False
high_vram: bool = False
kv_cache: bool = True
ar_checkpoint: str = None
clvp_checkpoint: str = None
diff_checkpoint: str = None
num_chars: int = 255
vocoder: VocType = VocConf.Univnet
# UnifiedVoice params
ar_max_mel_tokens: int = 604
ar_max_text_tokens: int = 402
ar_max_conditioning_inputs: int = 2
ar_layers: int = 30
ar_model_dim: int = 1024
ar_heads: int = 16
ar_number_text_tokens: int = 255
ar_start_text_token: int = 255
ar_checkpointing: bool = False
ar_train_solo_embeddings: bool = False
# DiffTTS params
diff_model_channels: int = 1024
diff_num_layers: int = 10
diff_in_channels: int = 100
diff_out_channels: int = 200
diff_in_latent_channels: int = 1024
diff_in_tokens: int = 8193
diff_dropout: int = 0
diff_use_fp16: bool = False
diff_num_heads: int = 16
diff_layer_drop: int = 0
diff_unconditioned_percentage: int = 0
# clvp params
clvp_dim_text: int = 768
clvp_dim_speech: int = 768
clvp_dim_latent: int = 768
clvp_num_text_tokens: int = 256
clvp_text_enc_depth: int = 20
clvp_text_seq_len: int = 350
clvp_text_heads: int = 12
clvp_num_speech_tokens: int = 8192
clvp_speech_enc_depth: int = 20
clvp_speech_heads: int = 12
clvp_speech_seq_len: int = 430
clvp_use_xformers: bool = True
# constants
duration_const: int = 102400
class Tortoise(BaseTTS):
"""Tortoise model class.
Currently only supports inference.
Examples:
>>> from TTS.tts.configs.tortoise_config import TortoiseConfig
>>> from TTS.tts.models.tortoise import Tortoise
>>> config = TortoiseConfig()
>>> model = Tortoise.inif_from_config(config)
>>> model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)
"""
def __init__(self, config: Coqpit):
super().__init__(config, ap=None, tokenizer=None)
self.mel_norm_path = None
self.config = config
self.ar_checkpoint = self.args.ar_checkpoint
self.diff_checkpoint = self.args.diff_checkpoint # TODO: check if this is even needed
self.models_dir = config.model_dir
self.autoregressive_batch_size = (
pick_best_batch_size_for_gpu()
if self.args.autoregressive_batch_size is None
else self.args.autoregressive_batch_size
)
self.enable_redaction = self.args.enable_redaction
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if self.enable_redaction:
self.aligner = Wav2VecAlignment()
self.tokenizer = VoiceBpeTokenizer()
self.autoregressive = UnifiedVoice(
max_mel_tokens=self.args.ar_max_mel_tokens,
max_text_tokens=self.args.ar_max_text_tokens,
max_conditioning_inputs=self.args.ar_max_conditioning_inputs,
layers=self.args.ar_layers,
model_dim=self.args.ar_model_dim,
heads=self.args.ar_heads,
number_text_tokens=self.args.ar_number_text_tokens,
start_text_token=self.args.ar_start_text_token,
checkpointing=self.args.ar_checkpointing,
train_solo_embeddings=self.args.ar_train_solo_embeddings,
).cpu()
self.diffusion = DiffusionTts(
model_channels=self.args.diff_model_channels,
num_layers=self.args.diff_num_layers,
in_channels=self.args.diff_in_channels,
out_channels=self.args.diff_out_channels,
in_latent_channels=self.args.diff_in_latent_channels,
in_tokens=self.args.diff_in_tokens,
dropout=self.args.diff_dropout,
use_fp16=self.args.diff_use_fp16,
num_heads=self.args.diff_num_heads,
layer_drop=self.args.diff_layer_drop,
unconditioned_percentage=self.args.diff_unconditioned_percentage,
).cpu()
self.clvp = CLVP(
dim_text=self.args.clvp_dim_text,
dim_speech=self.args.clvp_dim_speech,
dim_latent=self.args.clvp_dim_latent,
num_text_tokens=self.args.clvp_num_text_tokens,
text_enc_depth=self.args.clvp_text_enc_depth,
text_seq_len=self.args.clvp_text_seq_len,
text_heads=self.args.clvp_text_heads,
num_speech_tokens=self.args.clvp_num_speech_tokens,
speech_enc_depth=self.args.clvp_speech_enc_depth,
speech_heads=self.args.clvp_speech_heads,
speech_seq_len=self.args.clvp_speech_seq_len,
use_xformers=self.args.clvp_use_xformers,
).cpu()
self.vocoder = self.args.vocoder.value.constructor().cpu()
# Random latent generators (RLGs) are loaded lazily.
self.rlg_auto = None
self.rlg_diffusion = None
if self.args.high_vram:
self.autoregressive = self.autoregressive.to(self.device)
self.diffusion = self.diffusion.to(self.device)
self.clvp = self.clvp.to(self.device)
self.vocoder = self.vocoder.to(self.device)
self.high_vram = self.args.high_vram
@contextmanager
def temporary_cuda(self, model):
if self.high_vram:
yield model
else:
m = model.to(self.device)
yield m
m = model.cpu()
def get_conditioning_latents(
self,
voice_samples,
return_mels=False,
latent_averaging_mode=0,
original_tortoise=False,
):
"""
Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
properties.
:param voice_samples: List of arbitrary reference clips, which should be *pairs* of torch tensors containing arbitrary kHz waveform data.
:param latent_averaging_mode: 0/1/2 for following modes:
0 - latents will be generated as in original tortoise, using ~4.27s from each voice sample, averaging latent across all samples
1 - latents will be generated using (almost) entire voice samples, averaged across all the ~4.27s chunks
2 - latents will be generated using (almost) entire voice samples, averaged per voice sample
"""
assert latent_averaging_mode in [
0,
1,
2,
], "latent_averaging mode has to be one of (0, 1, 2)"
with torch.no_grad():
voice_samples = [[v.to(self.device) for v in ls] for ls in voice_samples]
auto_conds = []
for ls in voice_samples:
auto_conds.append(format_conditioning(ls[0], device=self.device, mel_norm_file=self.mel_norm_path))
auto_conds = torch.stack(auto_conds, dim=1)
with self.temporary_cuda(self.autoregressive) as ar:
auto_latent = ar.get_conditioning(auto_conds)
diffusion_conds = []
DURS_CONST = self.args.duration_const
for ls in voice_samples:
# The diffuser operates at a sample rate of 24000 (except for the latent inputs)
sample = torchaudio.functional.resample(ls[0], 22050, 24000) if original_tortoise else ls[1]
if latent_averaging_mode == 0:
sample = pad_or_truncate(sample, DURS_CONST)
cond_mel = wav_to_univnet_mel(
sample.to(self.device),
do_normalization=False,
device=self.device,
)
diffusion_conds.append(cond_mel)
else:
from math import ceil
if latent_averaging_mode == 2:
temp_diffusion_conds = []
for chunk in range(ceil(sample.shape[1] / DURS_CONST)):
current_sample = sample[:, chunk * DURS_CONST : (chunk + 1) * DURS_CONST]
current_sample = pad_or_truncate(current_sample, DURS_CONST)
cond_mel = wav_to_univnet_mel(
current_sample.to(self.device),
do_normalization=False,
device=self.device,
)
if latent_averaging_mode == 1:
diffusion_conds.append(cond_mel)
elif latent_averaging_mode == 2:
temp_diffusion_conds.append(cond_mel)
if latent_averaging_mode == 2:
diffusion_conds.append(torch.stack(temp_diffusion_conds).mean(0))
diffusion_conds = torch.stack(diffusion_conds, dim=1)
with self.temporary_cuda(self.diffusion) as diffusion:
diffusion_latent = diffusion.get_conditioning(diffusion_conds)
if return_mels:
return auto_latent, diffusion_latent, auto_conds, diffusion_conds
return auto_latent, diffusion_latent
def get_random_conditioning_latents(self):
# Lazy-load the RLG models.
if self.rlg_auto is None:
self.rlg_auto = RandomLatentConverter(1024).eval()
self.rlg_auto.load_state_dict(
torch.load(
os.path.join(self.models_dir, "rlg_auto.pth"),
map_location=torch.device("cpu"),
)
)
self.rlg_diffusion = RandomLatentConverter(2048).eval()
self.rlg_diffusion.load_state_dict(
torch.load(
os.path.join(self.models_dir, "rlg_diffuser.pth"),
map_location=torch.device("cpu"),
)
)
with torch.no_grad():
return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
def synthesize(self, text, config, speaker_id="random", voice_dirs=None, **kwargs):
"""Synthesize speech with the given input text.
Args:
text (str): Input text.
config (TortoiseConfig): Config with inference parameters.
speaker_id (str): One of the available speaker names. If `random`, it generates a random speaker.
voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None.
**kwargs: Inference settings. See `inference()`.
Returns:
A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference,
`text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents`
as latents used at inference.
"""
speaker_id = "random" if speaker_id is None else speaker_id
if voice_dirs is not None:
voice_dirs = [voice_dirs]
voice_samples, conditioning_latents = load_voice(speaker_id, voice_dirs)
else:
voice_samples, conditioning_latents = load_voice(speaker_id)
outputs = self.inference_with_config(
text, config, voice_samples=voice_samples, conditioning_latents=conditioning_latents, **kwargs
)
return_dict = {
"wav": outputs["wav"],
"deterministic_seed": outputs["deterministic_seed"],
"text_inputs": outputs["text"],
"voice_samples": outputs["voice_samples"],
"conditioning_latents": outputs["conditioning_latents"],
}
return return_dict
def inference_with_config(self, text, config, **kwargs):
"""
inference with config
#TODO describe in detail
"""
# Use generally found best tuning knobs for generation.
settings = {
"temperature": config.temperature,
"length_penalty": config.length_penalty,
"repetition_penalty": config.repetition_penalty,
"top_p": config.top_p,
"cond_free_k": config.cond_free_k,
"diffusion_temperature": config.diffusion_temperature,
"sampler": config.sampler,
}
# Presets are defined here.
presets = {
"single_sample": {
"num_autoregressive_samples": 8,
"diffusion_iterations": 10,
"sampler": "ddim",
},
"ultra_fast": {
"num_autoregressive_samples": 16,
"diffusion_iterations": 10,
"sampler": "ddim",
},
"ultra_fast_old": {
"num_autoregressive_samples": 16,
"diffusion_iterations": 30,
"cond_free": False,
},
"very_fast": {
"num_autoregressive_samples": 32,
"diffusion_iterations": 30,
"sampler": "dpm++2m",
},
"fast": {
"num_autoregressive_samples": 5,
"diffusion_iterations": 50,
"sampler": "ddim",
},
"fast_old": {"num_autoregressive_samples": 96, "diffusion_iterations": 80},
"standard": {
"num_autoregressive_samples": 5,
"diffusion_iterations": 200,
},
"high_quality": {
"num_autoregressive_samples": 256,
"diffusion_iterations": 400,
},
}
if "preset" in kwargs:
settings.update(presets[kwargs["preset"]])
kwargs.pop("preset")
settings.update(kwargs) # allow overriding of preset settings with kwargs
return self.inference(text, **settings)
def inference(
self,
text,
voice_samples=None,
conditioning_latents=None,
k=1,
verbose=True,
use_deterministic_seed=None,
return_deterministic_state=False,
latent_averaging_mode=0,
# autoregressive generation parameters follow
num_autoregressive_samples=16,
temperature=0.8,
length_penalty=1,
repetition_penalty=2.0,
top_p=0.8,
max_mel_tokens=500,
# diffusion generation parameters follow
diffusion_iterations=100,
cond_free=True,
cond_free_k=2,
diffusion_temperature=1.0,
sampler="ddim",
half=True,
original_tortoise=False,
**hf_generate_kwargs,
):
"""
This function produces an audio clip of the given text being spoken with the given reference voice.
Args:
text: (str) Text to be spoken.
voice_samples: (List[Tuple[torch.Tensor]]) List of an arbitrary number of reference clips, which should be tuple-pairs
of torch tensors containing arbitrary kHz waveform data.
conditioning_latents: (Tuple[autoregressive_conditioning_latent, diffusion_conditioning_latent]) A tuple of
(autoregressive_conditioning_latent, diffusion_conditioning_latent), which can be provided in lieu
of voice_samples. This is ignored unless `voice_samples=None`. Conditioning latents can be retrieved
via `get_conditioning_latents()`.
k: (int) The number of returned clips. The most likely (as determined by Tortoises' CLVP model) clips are returned.
latent_averaging_mode: (int) 0/1/2 for following modes:
0 - latents will be generated as in original tortoise, using ~4.27s from each voice sample, averaging latent across all samples
1 - latents will be generated using (almost) entire voice samples, averaged across all the ~4.27s chunks
2 - latents will be generated using (almost) entire voice samples, averaged per voice sample
verbose: (bool) Whether or not to print log messages indicating the progress of creating a clip. Default=true.
num_autoregressive_samples: (int) Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
temperature: (float) The softmax temperature of the autoregressive model.
length_penalty: (float) A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs.
repetition_penalty: (float) A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce
the incidence of long silences or "uhhhhhhs", etc.
top_p: (float) P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely" (aka boring) outputs.
max_mel_tokens: (int) Restricts the output length. (0,600] integer. Each unit is 1/20 of a second.
typical_sampling: (bool) Turns typical sampling on or off. This sampling mode is discussed in this paper: https://arxiv.org/abs/2202.00666
I was interested in the premise, but the results were not as good as I was hoping. This is off by default, but could use some tuning.
typical_mass: (float) The typical_mass parameter from the typical_sampling algorithm.
diffusion_iterations: (int) Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively
refine the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better, however.
cond_free: (bool) Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for
each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output of the two
is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and dramatically improves realism.
cond_free_k: (float) Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
diffusion_temperature: (float) Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
are the "mean" prediction of the diffusion network and will sound bland and smeared.
hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive transformer.
Extra keyword args fed to this function get forwarded directly to that API. Documentation
here: https://huggingface.co/docs/transformers/internal/generation_utils
Returns:
Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
Sample rate is 24kHz.
"""
deterministic_seed = deterministic_state(seed=use_deterministic_seed)
text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary.
assert (
text_tokens.shape[-1] < 400
), "Too much text provided. Break the text up into separate segments and re-try inference."
if voice_samples is not None:
(
auto_conditioning,
diffusion_conditioning,
_,
_,
) = self.get_conditioning_latents(
voice_samples,
return_mels=True,
latent_averaging_mode=latent_averaging_mode,
original_tortoise=original_tortoise,
)
elif conditioning_latents is not None:
auto_conditioning, diffusion_conditioning = conditioning_latents
else:
(
auto_conditioning,
diffusion_conditioning,
) = self.get_random_conditioning_latents()
auto_conditioning = auto_conditioning.to(self.device)
diffusion_conditioning = diffusion_conditioning.to(self.device)
diffuser = load_discrete_vocoder_diffuser(
desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k, sampler=sampler
)
# in the case of single_sample,
orig_batch_size = self.autoregressive_batch_size
while num_autoregressive_samples % self.autoregressive_batch_size:
self.autoregressive_batch_size //= 2
with torch.no_grad():
samples = []
num_batches = num_autoregressive_samples // self.autoregressive_batch_size
stop_mel_token = self.autoregressive.stop_mel_token
calm_token = (
83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
)
self.autoregressive = self.autoregressive.to(self.device)
if verbose:
print("Generating autoregressive samples..")
with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast(
device_type="cuda", dtype=torch.float16, enabled=half
):
for b in tqdm(range(num_batches), disable=not verbose):
codes = autoregressive.inference_speech(
auto_conditioning,
text_tokens,
do_sample=True,
top_p=top_p,
temperature=temperature,
num_return_sequences=self.autoregressive_batch_size,
length_penalty=length_penalty,
repetition_penalty=repetition_penalty,
max_generate_length=max_mel_tokens,
**hf_generate_kwargs,
)
padding_needed = max_mel_tokens - codes.shape[1]
codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
samples.append(codes)
self.autoregressive_batch_size = orig_batch_size # in the case of single_sample
clip_results = []
with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
device_type="cuda", dtype=torch.float16, enabled=half
):
for batch in tqdm(samples, disable=not verbose):
for i in range(batch.shape[0]):
batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
clvp_res = clvp(
text_tokens.repeat(batch.shape[0], 1),
batch,
return_loss=False,
)
clip_results.append(clvp_res)
clip_results = torch.cat(clip_results, dim=0)
samples = torch.cat(samples, dim=0)
best_results = samples[torch.topk(clip_results, k=k).indices]
del samples
# The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
# inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
# results, but will increase memory usage.
with self.temporary_cuda(self.autoregressive) as autoregressive:
best_latents = autoregressive(
auto_conditioning.repeat(k, 1),
text_tokens.repeat(k, 1),
torch.tensor([text_tokens.shape[-1]], device=text_tokens.device),
best_results,
torch.tensor(
[best_results.shape[-1] * self.autoregressive.mel_length_compression],
device=text_tokens.device,
),
return_latent=True,
clip_inputs=False,
)
del auto_conditioning
if verbose:
print("Transforming autoregressive outputs into audio..")
wav_candidates = []
for b in range(best_results.shape[0]):
codes = best_results[b].unsqueeze(0)
latents = best_latents[b].unsqueeze(0)
# Find the first occurrence of the "calm" token and trim the codes to that.
ctokens = 0
for code in range(codes.shape[-1]):
if codes[0, code] == calm_token:
ctokens += 1
else:
ctokens = 0
if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
latents = latents[:, :code]
break
with self.temporary_cuda(self.diffusion) as diffusion:
mel = do_spectrogram_diffusion(
diffusion,
diffuser,
latents,
diffusion_conditioning,
temperature=diffusion_temperature,
verbose=verbose,
)
with self.temporary_cuda(self.vocoder) as vocoder:
wav = vocoder.inference(mel)
wav_candidates.append(wav.cpu())
def potentially_redact(clip, text):
if self.enable_redaction:
return self.aligner.redact(clip.squeeze(1), text).unsqueeze(1)
return clip
wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
if len(wav_candidates) > 1:
res = wav_candidates
else:
res = wav_candidates[0]
return_dict = {
"wav": res,
"deterministic_seed": None,
"text": None,
"voice_samples": None,
"conditioning_latents": None,
}
if return_deterministic_state:
return_dict = {
"wav": res,
"deterministic_seed": deterministic_seed,
"text": text,
"voice_samples": voice_samples,
"conditioning_latents": conditioning_latents,
}
return return_dict
def forward(self):
raise NotImplementedError("Tortoise Training is not implemented")
def eval_step(self):
raise NotImplementedError("Tortoise Training is not implemented")
@staticmethod
def init_from_config(config: "TortoiseConfig", **kwargs): # pylint: disable=unused-argument
return Tortoise(config)
def load_checkpoint(
self,
config,
checkpoint_dir,
ar_checkpoint_path=None,
diff_checkpoint_path=None,
clvp_checkpoint_path=None,
vocoder_checkpoint_path=None,
eval=False,
strict=True,
**kwargs,
): # pylint: disable=unused-argument, redefined-builtin
"""Load a model checkpoints from a directory. This model is with multiple checkpoint files and it
expects to have all the files to be under the given `checkpoint_dir` with the rigth names.
If eval is True, set the model to eval mode.
Args:
config (TortoiseConfig): The model config.
checkpoint_dir (str): The directory where the checkpoints are stored.
ar_checkpoint_path (str, optional): The path to the autoregressive checkpoint. Defaults to None.
diff_checkpoint_path (str, optional): The path to the diffusion checkpoint. Defaults to None.
clvp_checkpoint_path (str, optional): The path to the CLVP checkpoint. Defaults to None.
vocoder_checkpoint_path (str, optional): The path to the vocoder checkpoint. Defaults to None.
eval (bool, optional): Whether to set the model to eval mode. Defaults to False.
strict (bool, optional): Whether to load the model strictly. Defaults to True.
"""
if self.models_dir is None:
self.models_dir = checkpoint_dir
ar_path = ar_checkpoint_path or os.path.join(checkpoint_dir, "autoregressive.pth")
diff_path = diff_checkpoint_path or os.path.join(checkpoint_dir, "diffusion_decoder.pth")
clvp_path = clvp_checkpoint_path or os.path.join(checkpoint_dir, "clvp2.pth")
vocoder_checkpoint_path = vocoder_checkpoint_path or os.path.join(checkpoint_dir, "vocoder.pth")
self.mel_norm_path = os.path.join(checkpoint_dir, "mel_norms.pth")
if os.path.exists(ar_path):
# remove keys from the checkpoint that are not in the model
checkpoint = torch.load(ar_path, map_location=torch.device("cpu"))
# strict set False
# due to removed `bias` and `masked_bias` changes in Transformers
self.autoregressive.load_state_dict(checkpoint, strict=False)
if os.path.exists(diff_path):
self.diffusion.load_state_dict(torch.load(diff_path), strict=strict)
if os.path.exists(clvp_path):
self.clvp.load_state_dict(torch.load(clvp_path), strict=strict)
if os.path.exists(vocoder_checkpoint_path):
self.vocoder.load_state_dict(
config.model_args.vocoder.value.optionally_index(
torch.load(
vocoder_checkpoint_path,
map_location=torch.device("cpu"),
)
)
)
if eval:
self.autoregressive.post_init_gpt2_config(self.args.kv_cache)
self.autoregressive.eval()
self.diffusion.eval()
self.clvp.eval()
self.vocoder.eval()
def train_step(self):
raise NotImplementedError("Tortoise Training is not implemented")
File diff suppressed because it is too large Load Diff
+791
View File
@@ -0,0 +1,791 @@
import os
from dataclasses import dataclass
import librosa
import torch
import torch.nn.functional as F
import torchaudio
from coqpit import Coqpit
from TTS.tts.layers.xtts.gpt import GPT
from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
from TTS.tts.layers.xtts.stream_generator import init_stream_support
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
from TTS.tts.layers.xtts.xtts_manager import SpeakerManager, LanguageManager
from TTS.tts.models.base_tts import BaseTTS
from TTS.utils.io import load_fsspec
init_stream_support()
def wav_to_mel_cloning(
wav,
mel_norms_file="../experiments/clips_mel_norms.pth",
mel_norms=None,
device=torch.device("cpu"),
n_fft=4096,
hop_length=1024,
win_length=4096,
power=2,
normalized=False,
sample_rate=22050,
f_min=0,
f_max=8000,
n_mels=80,
):
"""
Convert waveform to mel-spectrogram with hard-coded parameters for cloning.
Args:
wav (torch.Tensor): Input waveform tensor.
mel_norms_file (str): Path to mel-spectrogram normalization file.
mel_norms (torch.Tensor): Mel-spectrogram normalization tensor.
device (torch.device): Device to use for computation.
Returns:
torch.Tensor: Mel-spectrogram tensor.
"""
mel_stft = torchaudio.transforms.MelSpectrogram(
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
power=power,
normalized=normalized,
sample_rate=sample_rate,
f_min=f_min,
f_max=f_max,
n_mels=n_mels,
norm="slaney",
).to(device)
wav = wav.to(device)
mel = mel_stft(wav)
mel = torch.log(torch.clamp(mel, min=1e-5))
if mel_norms is None:
mel_norms = torch.load(mel_norms_file, map_location=device)
mel = mel / mel_norms.unsqueeze(0).unsqueeze(-1)
return mel
def load_audio(audiopath, sampling_rate):
# better load setting following: https://github.com/faroit/python_audio_loading_benchmark
# torchaudio should chose proper backend to load audio depending on platform
audio, lsr = torchaudio.load(audiopath)
# stereo to mono if needed
if audio.size(0) != 1:
audio = torch.mean(audio, dim=0, keepdim=True)
if lsr != sampling_rate:
audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
# '10' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
if torch.any(audio > 10) or not torch.any(audio < 0):
print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
# clip audio invalid values
audio.clip_(-1, 1)
return audio
def pad_or_truncate(t, length):
"""
Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it.
Args:
t (torch.Tensor): The input tensor to be padded or truncated.
length (int): The desired length of the tensor.
Returns:
torch.Tensor: The padded or truncated tensor.
"""
tp = t[..., :length]
if t.shape[-1] == length:
tp = t
elif t.shape[-1] < length:
tp = F.pad(t, (0, length - t.shape[-1]))
return tp
@dataclass
class XttsAudioConfig(Coqpit):
"""
Configuration class for audio-related parameters in the XTTS model.
Args:
sample_rate (int): The sample rate in which the GPT operates.
output_sample_rate (int): The sample rate of the output audio waveform.
"""
sample_rate: int = 22050
output_sample_rate: int = 24000
@dataclass
class XttsArgs(Coqpit):
"""A dataclass to represent XTTS model arguments that define the model structure.
Args:
gpt_batch_size (int): The size of the auto-regressive batch.
enable_redaction (bool, optional): Whether to enable redaction. Defaults to True.
kv_cache (bool, optional): Whether to use the kv_cache. Defaults to True.
gpt_checkpoint (str, optional): The checkpoint for the autoregressive model. Defaults to None.
clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None.
decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None.
num_chars (int, optional): The maximum number of characters to generate. Defaults to 255.
For GPT model:
gpt_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604.
gpt_max_text_tokens (int, optional): The maximum text tokens for the autoregressive model. Defaults to 402.
gpt_max_prompt_tokens (int, optional): The maximum prompt tokens or the autoregressive model. Defaults to 70.
gpt_layers (int, optional): The number of layers for the autoregressive model. Defaults to 30.
gpt_n_model_channels (int, optional): The model dimension for the autoregressive model. Defaults to 1024.
gpt_n_heads (int, optional): The number of heads for the autoregressive model. Defaults to 16.
gpt_number_text_tokens (int, optional): The number of text tokens for the autoregressive model. Defaults to 255.
gpt_start_text_token (int, optional): The start text token for the autoregressive model. Defaults to 255.
gpt_checkpointing (bool, optional): Whether to use checkpointing for the autoregressive model. Defaults to False.
gpt_train_solo_embeddings (bool, optional): Whether to train embeddings for the autoregressive model. Defaults to False.
gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
"""
gpt_batch_size: int = 1
enable_redaction: bool = False
kv_cache: bool = True
gpt_checkpoint: str = None
clvp_checkpoint: str = None
decoder_checkpoint: str = None
num_chars: int = 255
# XTTS GPT Encoder params
tokenizer_file: str = ""
gpt_max_audio_tokens: int = 605
gpt_max_text_tokens: int = 402
gpt_max_prompt_tokens: int = 70
gpt_layers: int = 30
gpt_n_model_channels: int = 1024
gpt_n_heads: int = 16
gpt_number_text_tokens: int = None
gpt_start_text_token: int = None
gpt_stop_text_token: int = None
gpt_num_audio_tokens: int = 8194
gpt_start_audio_token: int = 8192
gpt_stop_audio_token: int = 8193
gpt_code_stride_len: int = 1024
gpt_use_masking_gt_prompt_approach: bool = True
gpt_use_perceiver_resampler: bool = False
# HifiGAN Decoder params
input_sample_rate: int = 22050
output_sample_rate: int = 24000
output_hop_length: int = 256
decoder_input_dim: int = 1024
d_vector_dim: int = 512
cond_d_vector_in_each_upsampling_layer: bool = True
# constants
duration_const: int = 102400
class Xtts(BaseTTS):
"""ⓍTTS model implementation.
❗ Currently it only supports inference.
Examples:
>>> from TTS.tts.configs.xtts_config import XttsConfig
>>> from TTS.tts.models.xtts import Xtts
>>> config = XttsConfig()
>>> model = Xtts.inif_from_config(config)
>>> model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)
"""
def __init__(self, config: Coqpit):
super().__init__(config, ap=None, tokenizer=None)
self.mel_stats_path = None
self.config = config
self.gpt_checkpoint = self.args.gpt_checkpoint
self.decoder_checkpoint = self.args.decoder_checkpoint # TODO: check if this is even needed
self.models_dir = config.model_dir
self.gpt_batch_size = self.args.gpt_batch_size
self.tokenizer = VoiceBpeTokenizer()
self.gpt = None
self.init_models()
self.register_buffer("mel_stats", torch.ones(80))
def init_models(self):
"""Initialize the models. We do it here since we need to load the tokenizer first."""
if self.tokenizer.tokenizer is not None:
self.args.gpt_number_text_tokens = self.tokenizer.get_number_tokens()
self.args.gpt_start_text_token = self.tokenizer.tokenizer.token_to_id("[START]")
self.args.gpt_stop_text_token = self.tokenizer.tokenizer.token_to_id("[STOP]")
if self.args.gpt_number_text_tokens:
self.gpt = GPT(
layers=self.args.gpt_layers,
model_dim=self.args.gpt_n_model_channels,
start_text_token=self.args.gpt_start_text_token,
stop_text_token=self.args.gpt_stop_text_token,
heads=self.args.gpt_n_heads,
max_text_tokens=self.args.gpt_max_text_tokens,
max_mel_tokens=self.args.gpt_max_audio_tokens,
max_prompt_tokens=self.args.gpt_max_prompt_tokens,
number_text_tokens=self.args.gpt_number_text_tokens,
num_audio_tokens=self.args.gpt_num_audio_tokens,
start_audio_token=self.args.gpt_start_audio_token,
stop_audio_token=self.args.gpt_stop_audio_token,
use_perceiver_resampler=self.args.gpt_use_perceiver_resampler,
code_stride_len=self.args.gpt_code_stride_len,
)
self.hifigan_decoder = HifiDecoder(
input_sample_rate=self.args.input_sample_rate,
output_sample_rate=self.args.output_sample_rate,
output_hop_length=self.args.output_hop_length,
ar_mel_length_compression=self.args.gpt_code_stride_len,
decoder_input_dim=self.args.decoder_input_dim,
d_vector_dim=self.args.d_vector_dim,
cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer,
)
@property
def device(self):
return next(self.parameters()).device
@torch.inference_mode()
def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6):
"""Compute the conditioning latents for the GPT model from the given audio.
Args:
audio (tensor): audio tensor.
sr (int): Sample rate of the audio.
length (int): Length of the audio in seconds. If < 0, use the whole audio. Defaults to 30.
chunk_length (int): Length of the audio chunks in seconds. When `length == chunk_length`, the whole audio
is being used without chunking. It must be < `length`. Defaults to 6.
"""
if sr != 22050:
audio = torchaudio.functional.resample(audio, sr, 22050)
if length > 0:
audio = audio[:, : 22050 * length]
if self.args.gpt_use_perceiver_resampler:
style_embs = []
for i in range(0, audio.shape[1], 22050 * chunk_length):
audio_chunk = audio[:, i : i + 22050 * chunk_length]
# if the chunk is too short ignore it
if audio_chunk.size(-1) < 22050 * 0.33:
continue
mel_chunk = wav_to_mel_cloning(
audio_chunk,
mel_norms=self.mel_stats.cpu(),
n_fft=2048,
hop_length=256,
win_length=1024,
power=2,
normalized=False,
sample_rate=22050,
f_min=0,
f_max=8000,
n_mels=80,
)
style_emb = self.gpt.get_style_emb(mel_chunk.to(self.device), None)
style_embs.append(style_emb)
# mean style embedding
cond_latent = torch.stack(style_embs).mean(dim=0)
else:
mel = wav_to_mel_cloning(
audio,
mel_norms=self.mel_stats.cpu(),
n_fft=4096,
hop_length=1024,
win_length=4096,
power=2,
normalized=False,
sample_rate=22050,
f_min=0,
f_max=8000,
n_mels=80,
)
cond_latent = self.gpt.get_style_emb(mel.to(self.device))
return cond_latent.transpose(1, 2)
@torch.inference_mode()
def get_speaker_embedding(self, audio, sr):
audio_16k = torchaudio.functional.resample(audio, sr, 16000)
return (
self.hifigan_decoder.speaker_encoder.forward(audio_16k.to(self.device), l2_norm=True)
.unsqueeze(-1)
.to(self.device)
)
@torch.inference_mode()
def get_conditioning_latents(
self,
audio_path,
max_ref_length=30,
gpt_cond_len=6,
gpt_cond_chunk_len=6,
librosa_trim_db=None,
sound_norm_refs=False,
load_sr=22050,
):
"""Get the conditioning latents for the GPT model from the given audio.
Args:
audio_path (str or List[str]): Path to reference audio file(s).
max_ref_length (int): Maximum length of each reference audio in seconds. Defaults to 30.
gpt_cond_len (int): Length of the audio used for gpt latents. Defaults to 6.
gpt_cond_chunk_len (int): Chunk length used for gpt latents. It must be <= gpt_conf_len. Defaults to 6.
librosa_trim_db (int, optional): Trim the audio using this value. If None, not trimming. Defaults to None.
sound_norm_refs (bool, optional): Whether to normalize the audio. Defaults to False.
load_sr (int, optional): Sample rate to load the audio. Defaults to 24000.
"""
# deal with multiples references
if not isinstance(audio_path, list):
audio_paths = [audio_path]
else:
audio_paths = audio_path
speaker_embeddings = []
audios = []
speaker_embedding = None
for file_path in audio_paths:
audio = load_audio(file_path, load_sr)
audio = audio[:, : load_sr * max_ref_length].to(self.device)
if sound_norm_refs:
audio = (audio / torch.abs(audio).max()) * 0.75
if librosa_trim_db is not None:
audio = librosa.effects.trim(audio, top_db=librosa_trim_db)[0]
# compute latents for the decoder
speaker_embedding = self.get_speaker_embedding(audio, load_sr)
speaker_embeddings.append(speaker_embedding)
audios.append(audio)
# merge all the audios and compute the latents for the gpt
full_audio = torch.cat(audios, dim=-1)
gpt_cond_latents = self.get_gpt_cond_latents(
full_audio, load_sr, length=gpt_cond_len, chunk_length=gpt_cond_chunk_len
) # [1, 1024, T]
if speaker_embeddings:
speaker_embedding = torch.stack(speaker_embeddings)
speaker_embedding = speaker_embedding.mean(dim=0)
return gpt_cond_latents, speaker_embedding
def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwargs):
"""Synthesize speech with the given input text.
Args:
text (str): Input text.
config (XttsConfig): Config with inference parameters.
speaker_wav (list): List of paths to the speaker audio files to be used for cloning.
language (str): Language ID of the speaker.
**kwargs: Inference settings. See `inference()`.
Returns:
A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference,
`text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents`
as latents used at inference.
"""
assert (
"zh-cn" if language == "zh" else language in self.config.languages
), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}"
# Use generally found best tuning knobs for generation.
settings = {
"temperature": config.temperature,
"length_penalty": config.length_penalty,
"repetition_penalty": config.repetition_penalty,
"top_k": config.top_k,
"top_p": config.top_p,
}
settings.update(kwargs) # allow overriding of preset settings with kwargs
if speaker_id is not None:
gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values()
return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings)
settings.update({
"gpt_cond_len": config.gpt_cond_len,
"gpt_cond_chunk_len": config.gpt_cond_chunk_len,
"max_ref_len": config.max_ref_len,
"sound_norm_refs": config.sound_norm_refs,
})
return self.full_inference(text, speaker_wav, language, **settings)
@torch.inference_mode()
def full_inference(
self,
text,
ref_audio_path,
language,
# GPT inference
temperature=0.75,
length_penalty=1.0,
repetition_penalty=10.0,
top_k=50,
top_p=0.85,
do_sample=True,
# Cloning
gpt_cond_len=30,
gpt_cond_chunk_len=6,
max_ref_len=10,
sound_norm_refs=False,
**hf_generate_kwargs,
):
"""
This function produces an audio clip of the given text being spoken with the given reference voice.
Args:
text: (str) Text to be spoken.
ref_audio_path: (str) Path to a reference audio file to be used for cloning. This audio file should be >3
seconds long.
language: (str) Language of the voice to be generated.
temperature: (float) The softmax temperature of the autoregressive model. Defaults to 0.65.
length_penalty: (float) A length penalty applied to the autoregressive decoder. Higher settings causes the
model to produce more terse outputs. Defaults to 1.0.
repetition_penalty: (float) A penalty that prevents the autoregressive decoder from repeating itself during
decoding. Can be used to reduce the incidence of long silences or "uhhhhhhs", etc. Defaults to 2.0.
top_k: (int) K value used in top-k sampling. [0,inf]. Lower values mean the decoder produces more "likely"
(aka boring) outputs. Defaults to 50.
top_p: (float) P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely"
(aka boring) outputs. Defaults to 0.8.
gpt_cond_len: (int) Length of the audio used for cloning. If audio is shorter, then audio length is used
else the first `gpt_cond_len` secs is used. Defaults to 30 seconds.
gpt_cond_chunk_len: (int) Chunk length used for cloning. It must be <= `gpt_cond_len`.
If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to 6 seconds.
hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive
transformer. Extra keyword args fed to this function get forwarded directly to that API. Documentation
here: https://huggingface.co/docs/transformers/internal/generation_utils
Returns:
Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
Sample rate is 24kHz.
"""
(gpt_cond_latent, speaker_embedding) = self.get_conditioning_latents(
audio_path=ref_audio_path,
gpt_cond_len=gpt_cond_len,
gpt_cond_chunk_len=gpt_cond_chunk_len,
max_ref_length=max_ref_len,
sound_norm_refs=sound_norm_refs,
)
return self.inference(
text,
language,
gpt_cond_latent,
speaker_embedding,
temperature=temperature,
length_penalty=length_penalty,
repetition_penalty=repetition_penalty,
top_k=top_k,
top_p=top_p,
do_sample=do_sample,
**hf_generate_kwargs,
)
@torch.inference_mode()
def inference(
self,
text,
language,
gpt_cond_latent,
speaker_embedding,
# GPT inference
temperature=0.75,
length_penalty=1.0,
repetition_penalty=10.0,
top_k=50,
top_p=0.85,
do_sample=True,
num_beams=1,
speed=1.0,
enable_text_splitting=False,
**hf_generate_kwargs,
):
language = language.split("-")[0] # remove the country code
length_scale = 1.0 / max(speed, 0.05)
gpt_cond_latent = gpt_cond_latent.to(self.device)
speaker_embedding = speaker_embedding.to(self.device)
if enable_text_splitting:
text = split_sentence(text, language, self.tokenizer.char_limits[language])
else:
text = [text]
wavs = []
gpt_latents_list = []
for sent in text:
sent = sent.strip().lower()
text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device)
assert (
text_tokens.shape[-1] < self.args.gpt_max_text_tokens
), " ❗ XTTS can only generate text with a maximum of 400 tokens."
with torch.no_grad():
gpt_codes = self.gpt.generate(
cond_latents=gpt_cond_latent,
text_inputs=text_tokens,
input_tokens=None,
do_sample=do_sample,
top_p=top_p,
top_k=top_k,
temperature=temperature,
num_return_sequences=self.gpt_batch_size,
num_beams=num_beams,
length_penalty=length_penalty,
repetition_penalty=repetition_penalty,
output_attentions=False,
**hf_generate_kwargs,
)
expected_output_len = torch.tensor(
[gpt_codes.shape[-1] * self.gpt.code_stride_len], device=text_tokens.device
)
text_len = torch.tensor([text_tokens.shape[-1]], device=self.device)
gpt_latents = self.gpt(
text_tokens,
text_len,
gpt_codes,
expected_output_len,
cond_latents=gpt_cond_latent,
return_attentions=False,
return_latent=True,
)
if length_scale != 1.0:
gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)
gpt_latents_list.append(gpt_latents.cpu())
wavs.append(self.hifigan_decoder(gpt_latents, g=speaker_embedding).cpu().squeeze())
return {
"wav": torch.cat(wavs, dim=0).numpy(),
"gpt_latents": torch.cat(gpt_latents_list, dim=1).numpy(),
"speaker_embedding": speaker_embedding,
}
def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
"""Handle chunk formatting in streaming mode"""
wav_chunk = wav_gen[:-overlap_len]
if wav_gen_prev is not None:
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
if wav_overlap is not None:
# cross fade the overlap section
if overlap_len > len(wav_chunk):
# wav_chunk is smaller than overlap_len, pass on last wav_gen
if wav_gen_prev is not None:
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) :]
else:
# not expecting will hit here as problem happens on last chunk
wav_chunk = wav_gen[-overlap_len:]
return wav_chunk, wav_gen, None
else:
crossfade_wav = wav_chunk[:overlap_len]
crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
wav_chunk[:overlap_len] += crossfade_wav
wav_overlap = wav_gen[-overlap_len:]
wav_gen_prev = wav_gen
return wav_chunk, wav_gen_prev, wav_overlap
@torch.inference_mode()
def inference_stream(
self,
text,
language,
gpt_cond_latent,
speaker_embedding,
# Streaming
stream_chunk_size=20,
overlap_wav_len=1024,
# GPT inference
temperature=0.75,
length_penalty=1.0,
repetition_penalty=10.0,
top_k=50,
top_p=0.85,
do_sample=True,
speed=1.0,
enable_text_splitting=False,
**hf_generate_kwargs,
):
language = language.split("-")[0] # remove the country code
length_scale = 1.0 / max(speed, 0.05)
gpt_cond_latent = gpt_cond_latent.to(self.device)
speaker_embedding = speaker_embedding.to(self.device)
if enable_text_splitting:
text = split_sentence(text, language, self.tokenizer.char_limits[language])
else:
text = [text]
for sent in text:
sent = sent.strip().lower()
text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device)
assert (
text_tokens.shape[-1] < self.args.gpt_max_text_tokens
), " ❗ XTTS can only generate text with a maximum of 400 tokens."
fake_inputs = self.gpt.compute_embeddings(
gpt_cond_latent.to(self.device),
text_tokens,
)
gpt_generator = self.gpt.get_generator(
fake_inputs=fake_inputs,
top_k=top_k,
top_p=top_p,
temperature=temperature,
do_sample=do_sample,
num_beams=1,
num_return_sequences=1,
length_penalty=float(length_penalty),
repetition_penalty=float(repetition_penalty),
output_attentions=False,
output_hidden_states=True,
**hf_generate_kwargs,
)
last_tokens = []
all_latents = []
wav_gen_prev = None
wav_overlap = None
is_end = False
while not is_end:
try:
x, latent = next(gpt_generator)
last_tokens += [x]
all_latents += [latent]
except StopIteration:
is_end = True
if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size):
gpt_latents = torch.cat(all_latents, dim=0)[None, :]
if length_scale != 1.0:
gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)
wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len
)
last_tokens = []
yield wav_chunk
def forward(self):
raise NotImplementedError(
"XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training"
)
def eval_step(self):
raise NotImplementedError(
"XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training"
)
@staticmethod
def init_from_config(config: "XttsConfig", **kwargs): # pylint: disable=unused-argument
return Xtts(config)
def eval(self): # pylint: disable=redefined-builtin
"""Sets the model to evaluation mode. Overrides the default eval() method to also set the GPT model to eval mode."""
self.gpt.init_gpt_for_inference()
super().eval()
def get_compatible_checkpoint_state_dict(self, model_path):
checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"]
# remove xtts gpt trainer extra keys
ignore_keys = ["torch_mel_spectrogram_style_encoder", "torch_mel_spectrogram_dvae", "dvae"]
for key in list(checkpoint.keys()):
# check if it is from the coqui Trainer if so convert it
if key.startswith("xtts."):
new_key = key.replace("xtts.", "")
checkpoint[new_key] = checkpoint[key]
del checkpoint[key]
key = new_key
# remove unused keys
if key.split(".")[0] in ignore_keys:
del checkpoint[key]
return checkpoint
def load_checkpoint(
self,
config,
checkpoint_dir=None,
checkpoint_path=None,
vocab_path=None,
eval=True,
strict=True,
use_deepspeed=False,
speaker_file_path=None,
):
"""
Loads a checkpoint from disk and initializes the model's state and tokenizer.
Args:
config (dict): The configuration dictionary for the model.
checkpoint_dir (str, optional): The directory where the checkpoint is stored. Defaults to None.
checkpoint_path (str, optional): The path to the checkpoint file. Defaults to None.
vocab_path (str, optional): The path to the vocabulary file. Defaults to None.
eval (bool, optional): Whether to set the model to evaluation mode. Defaults to True.
strict (bool, optional): Whether to strictly enforce that the keys in the checkpoint match the keys in the model. Defaults to True.
Returns:
None
"""
model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth")
vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json")
if speaker_file_path is None and checkpoint_dir is not None:
speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth")
self.language_manager = LanguageManager(config)
self.speaker_manager = None
if speaker_file_path is not None and os.path.exists(speaker_file_path):
self.speaker_manager = SpeakerManager(speaker_file_path)
if os.path.exists(vocab_path):
self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path)
self.init_models()
checkpoint = self.get_compatible_checkpoint_state_dict(model_path)
# deal with v1 and v1.1. V1 has the init_gpt_for_inference keys, v1.1 do not
try:
self.load_state_dict(checkpoint, strict=strict)
except:
if eval:
self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache)
self.load_state_dict(checkpoint, strict=strict)
if eval:
self.hifigan_decoder.eval()
self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=use_deepspeed)
self.gpt.eval()
def train_step(self):
raise NotImplementedError(
"XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training"
)