From 6af40bc6cf5569c8054a32ff43402f5e4dcfcc4e Mon Sep 17 00:00:00 2001
From: Sam Khoze <68170403+SamKhoze@users.noreply.github.com>
Date: Tue, 18 Jun 2024 13:18:37 -0700
Subject: [PATCH] Add files via upload

---
 TTS/VERSION | 1 +
 TTS/api.py | 458 +++++++++++++
 TTS/bin/__init__.py | 0
 TTS/bin/collect_env_info.py | 48 ++
 TTS/bin/compute_attention_masks.py | 165 +++++
 TTS/bin/compute_embeddings.py | 197 ++++++
 TTS/bin/compute_statistics.py | 96 +++
 TTS/bin/eval_encoder.py | 88 +++
 TTS/bin/extract_tts_spectrograms.py | 287 ++++++++
 TTS/bin/find_unique_chars.py | 45 ++
 TTS/bin/find_unique_phonemes.py | 74 ++
 TTS/bin/remove_silence_using_vad.py | 124 ++++
 TTS/bin/resample.py | 90 +++
 TTS/bin/synthesize.py | 494 ++++++++++++++
 TTS/bin/train_encoder.py | 332 +++++++++
 TTS/bin/train_tts.py | 71 ++
 TTS/bin/train_vocoder.py | 77 +++
 TTS/bin/tune_wavegrad.py | 103 +++
 TTS/config/__init__.py | 135 ++++
 .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 6650 bytes
 .../shared_configs.cpython-311.pyc | Bin 0 -> 11855 bytes
 TTS/config/shared_configs.py | 268 ++++++++
 TTS/encoder/README.md | 18 +
 TTS/encoder/__init__.py | 0
 .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 173 bytes
 .../__pycache__/losses.cpython-311.pyc | Bin 0 -> 13636 bytes
 TTS/encoder/configs/base_encoder_config.py | 61 ++
 TTS/encoder/configs/emotion_encoder_config.py | 12 +
 TTS/encoder/configs/speaker_encoder_config.py | 11 +
 TTS/encoder/dataset.py | 147 ++++
 TTS/encoder/losses.py | 226 +++++++
 .../__pycache__/base_encoder.cpython-311.pyc | Bin 0 -> 8120 bytes
 .../models/__pycache__/lstm.cpython-311.pyc | Bin 0 -> 6648 bytes
 .../models/__pycache__/resnet.cpython-311.pyc | Bin 0 -> 11976 bytes
 TTS/encoder/models/base_encoder.py | 161 +++++
 TTS/encoder/models/lstm.py | 99 +++
 TTS/encoder/models/resnet.py | 198 ++++++
 TTS/encoder/requirements.txt | 2 +
 TTS/encoder/utils/__init__.py | 0
 .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 179 bytes
 .../__pycache__/generic_utils.cpython-311.pyc | Bin 0 -> 7424 bytes
 TTS/encoder/utils/generic_utils.py | 136 ++++
 TTS/encoder/utils/prepare_voxceleb.py | 219 ++++++
 TTS/encoder/utils/training.py | 99 +++
 TTS/encoder/utils/visual.py | 50 ++
 TTS/model.py | 59 ++
 TTS/utils/__init__.py | 0
 .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 171 bytes
 .../__pycache__/generic_utils.cpython-311.pyc | Bin 0 -> 15661 bytes
 TTS/utils/__pycache__/io.cpython-311.pyc | Bin 0 -> 4685 bytes
 TTS/utils/__pycache__/manage.cpython-311.pyc | Bin 0 -> 36630 bytes
 .../__pycache__/samplers.cpython-311.pyc | Bin 0 -> 12400 bytes
 .../__pycache__/synthesizer.cpython-311.pyc | Bin 0 -> 24501 bytes
 TTS/utils/audio/__init__.py | 1 +
 .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 255 bytes
 .../numpy_transforms.cpython-311.pyc | Bin 0 -> 24021 bytes
 .../__pycache__/processor.cpython-311.pyc | Bin 0 -> 27818 bytes
 .../torch_transforms.cpython-311.pyc | Bin 0 -> 7009 bytes
 TTS/utils/audio/numpy_transforms.py | 485 ++++++++++++++
 TTS/utils/audio/processor.py | 633 ++++++++++++++++++
 TTS/utils/audio/torch_transforms.py | 165 +++++
 TTS/utils/callbacks.py | 105 +++
 TTS/utils/capacitron_optimizer.py | 67 ++
 TTS/utils/distribute.py | 20 +
 TTS/utils/download.py | 206 ++++++
 TTS/utils/downloaders.py | 126 ++++
 TTS/utils/generic_utils.py | 239 +++++++
 TTS/utils/io.py | 70 ++
 TTS/utils/manage.py | 621 +++++++++++++++++
 TTS/utils/radam.py | 105 +++
 TTS/utils/samplers.py | 201 ++++++
 TTS/utils/synthesizer.py | 505 ++++++++++++++
 TTS/utils/training.py | 44 ++
 TTS/utils/vad.py | 88 +++
 74 files changed, 8332 insertions(+)
 create mode
100644 TTS/VERSION create mode 100644 TTS/api.py create mode 100644 TTS/bin/__init__.py create mode 100644 TTS/bin/collect_env_info.py create mode 100644 TTS/bin/compute_attention_masks.py create mode 100644 TTS/bin/compute_embeddings.py create mode 100644 TTS/bin/compute_statistics.py create mode 100644 TTS/bin/eval_encoder.py create mode 100644 TTS/bin/extract_tts_spectrograms.py create mode 100644 TTS/bin/find_unique_chars.py create mode 100644 TTS/bin/find_unique_phonemes.py create mode 100644 TTS/bin/remove_silence_using_vad.py create mode 100644 TTS/bin/resample.py create mode 100644 TTS/bin/synthesize.py create mode 100644 TTS/bin/train_encoder.py create mode 100644 TTS/bin/train_tts.py create mode 100644 TTS/bin/train_vocoder.py create mode 100644 TTS/bin/tune_wavegrad.py create mode 100644 TTS/config/__init__.py create mode 100644 TTS/config/__pycache__/__init__.cpython-311.pyc create mode 100644 TTS/config/__pycache__/shared_configs.cpython-311.pyc create mode 100644 TTS/config/shared_configs.py create mode 100644 TTS/encoder/README.md create mode 100644 TTS/encoder/__init__.py create mode 100644 TTS/encoder/__pycache__/__init__.cpython-311.pyc create mode 100644 TTS/encoder/__pycache__/losses.cpython-311.pyc create mode 100644 TTS/encoder/configs/base_encoder_config.py create mode 100644 TTS/encoder/configs/emotion_encoder_config.py create mode 100644 TTS/encoder/configs/speaker_encoder_config.py create mode 100644 TTS/encoder/dataset.py create mode 100644 TTS/encoder/losses.py create mode 100644 TTS/encoder/models/__pycache__/base_encoder.cpython-311.pyc create mode 100644 TTS/encoder/models/__pycache__/lstm.cpython-311.pyc create mode 100644 TTS/encoder/models/__pycache__/resnet.cpython-311.pyc create mode 100644 TTS/encoder/models/base_encoder.py create mode 100644 TTS/encoder/models/lstm.py create mode 100644 TTS/encoder/models/resnet.py create mode 100644 TTS/encoder/requirements.txt create mode 100644 TTS/encoder/utils/__init__.py create mode 100644 TTS/encoder/utils/__pycache__/__init__.cpython-311.pyc create mode 100644 TTS/encoder/utils/__pycache__/generic_utils.cpython-311.pyc create mode 100644 TTS/encoder/utils/generic_utils.py create mode 100644 TTS/encoder/utils/prepare_voxceleb.py create mode 100644 TTS/encoder/utils/training.py create mode 100644 TTS/encoder/utils/visual.py create mode 100644 TTS/model.py create mode 100644 TTS/utils/__init__.py create mode 100644 TTS/utils/__pycache__/__init__.cpython-311.pyc create mode 100644 TTS/utils/__pycache__/generic_utils.cpython-311.pyc create mode 100644 TTS/utils/__pycache__/io.cpython-311.pyc create mode 100644 TTS/utils/__pycache__/manage.cpython-311.pyc create mode 100644 TTS/utils/__pycache__/samplers.cpython-311.pyc create mode 100644 TTS/utils/__pycache__/synthesizer.cpython-311.pyc create mode 100644 TTS/utils/audio/__init__.py create mode 100644 TTS/utils/audio/__pycache__/__init__.cpython-311.pyc create mode 100644 TTS/utils/audio/__pycache__/numpy_transforms.cpython-311.pyc create mode 100644 TTS/utils/audio/__pycache__/processor.cpython-311.pyc create mode 100644 TTS/utils/audio/__pycache__/torch_transforms.cpython-311.pyc create mode 100644 TTS/utils/audio/numpy_transforms.py create mode 100644 TTS/utils/audio/processor.py create mode 100644 TTS/utils/audio/torch_transforms.py create mode 100644 TTS/utils/callbacks.py create mode 100644 TTS/utils/capacitron_optimizer.py create mode 100644 TTS/utils/distribute.py create mode 100644 TTS/utils/download.py create mode 100644 TTS/utils/downloaders.py create 
mode 100644 TTS/utils/generic_utils.py create mode 100644 TTS/utils/io.py create mode 100644 TTS/utils/manage.py create mode 100644 TTS/utils/radam.py create mode 100644 TTS/utils/samplers.py create mode 100644 TTS/utils/synthesizer.py create mode 100644 TTS/utils/training.py create mode 100644 TTS/utils/vad.py diff --git a/TTS/VERSION b/TTS/VERSION new file mode 100644 index 0000000..2157409 --- /dev/null +++ b/TTS/VERSION @@ -0,0 +1 @@ +0.22.0 diff --git a/TTS/api.py b/TTS/api.py new file mode 100644 index 0000000..7abc188 --- /dev/null +++ b/TTS/api.py @@ -0,0 +1,458 @@ +import tempfile +import warnings +from pathlib import Path +from typing import Union + +import numpy as np +from torch import nn + +from TTS.utils.audio.numpy_transforms import save_wav +from TTS.utils.manage import ModelManager +from TTS.utils.synthesizer import Synthesizer +from TTS.config import load_config + + +class TTS(nn.Module): + """TODO: Add voice conversion and Capacitron support.""" + + def __init__( + self, + model_name: str = "", + model_path: str = None, + config_path: str = None, + vocoder_path: str = None, + vocoder_config_path: str = None, + progress_bar: bool = True, + gpu=False, + ): + """🐸TTS python interface that allows to load and use the released models. + + Example with a multi-speaker model: + >>> from TTS.api import TTS + >>> tts = TTS(TTS.list_models()[0]) + >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0]) + >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") + + Example with a single-speaker model: + >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) + >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") + + Example loading a model from a path: + >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) + >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") + + Example voice cloning with YourTTS in English, French and Portuguese: + >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) + >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav") + >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav") + >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav") + + Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html): + >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True) + >>> tts.tts_to_file("This is a test.", file_path="output.wav") + + Args: + model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. + model_path (str, optional): Path to the model checkpoint. Defaults to None. + config_path (str, optional): Path to the model config. Defaults to None. + vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. + vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. + progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. 
Defaults to False. + """ + super().__init__() + self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False) + self.config = load_config(config_path) if config_path else None + self.synthesizer = None + self.voice_converter = None + self.model_name = "" + if gpu: + warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") + + if model_name is not None and len(model_name) > 0: + if "tts_models" in model_name: + self.load_tts_model_by_name(model_name, gpu) + elif "voice_conversion_models" in model_name: + self.load_vc_model_by_name(model_name, gpu) + else: + self.load_model_by_name(model_name, gpu) + + if model_path: + self.load_tts_model_by_path( + model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu + ) + + @property + def models(self): + return self.manager.list_tts_models() + + @property + def is_multi_speaker(self): + if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager: + return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 + return False + + @property + def is_multi_lingual(self): + # Not sure what sets this to None, but applied a fix to prevent crashing. + if ( + isinstance(self.model_name, str) + and "xtts" in self.model_name + or self.config + and ("xtts" in self.config.model or len(self.config.languages) > 1) + ): + return True + if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: + return self.synthesizer.tts_model.language_manager.num_languages > 1 + return False + + @property + def speakers(self): + if not self.is_multi_speaker: + return None + return self.synthesizer.tts_model.speaker_manager.speaker_names + + @property + def languages(self): + if not self.is_multi_lingual: + return None + return self.synthesizer.tts_model.language_manager.language_names + + @staticmethod + def get_models_file_path(): + return Path(__file__).parent / ".models.json" + + def list_models(self): + return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False) + + def download_model_by_name(self, model_name: str): + model_path, config_path, model_item = self.manager.download_model(model_name) + if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): + # return model directory if there are multiple files + # we assume that the model knows how to load itself + return None, None, None, None, model_path + if model_item.get("default_vocoder") is None: + return model_path, config_path, None, None, None + vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"]) + return model_path, config_path, vocoder_path, vocoder_config_path, None + + def load_model_by_name(self, model_name: str, gpu: bool = False): + """Load one of the 🐸TTS models by name. + + Args: + model_name (str): Model name to load. You can list models by ```tts.models```. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + """ + self.load_tts_model_by_name(model_name, gpu) + + def load_vc_model_by_name(self, model_name: str, gpu: bool = False): + """Load one of the voice conversion models by name. + + Args: + model_name (str): Model name to load. You can list models by ```tts.models```. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. 
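+
+        Example (a minimal, illustrative sketch; the FreeVC model name below is the same
+        one `tts_with_vc` falls back to, and the wav paths are placeholders):
+            >>> tts = TTS()
+            >>> tts.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24", gpu=False)
+            >>> tts.voice_conversion_to_file(source_wav="source.wav", target_wav="target.wav", file_path="converted.wav")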
+ """ + self.model_name = model_name + model_path, config_path, _, _, _ = self.download_model_by_name(model_name) + self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu) + + def load_tts_model_by_name(self, model_name: str, gpu: bool = False): + """Load one of 🐸TTS models by name. + + Args: + model_name (str): Model name to load. You can list models by ```tts.models```. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + + TODO: Add tests + """ + self.synthesizer = None + self.model_name = model_name + + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name + ) + + # init synthesizer + # None values are fetch from the model + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + tts_speakers_file=None, + tts_languages_file=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + encoder_checkpoint=None, + encoder_config=None, + model_dir=model_dir, + use_cuda=gpu, + ) + + def load_tts_model_by_path( + self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False + ): + """Load a model from a path. + + Args: + model_path (str): Path to the model checkpoint. + config_path (str): Path to the model config. + vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. + vocoder_config (str, optional): Path to the vocoder config. Defaults to None. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + """ + + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + tts_speakers_file=None, + tts_languages_file=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config, + encoder_checkpoint=None, + encoder_config=None, + use_cuda=gpu, + ) + + def _check_arguments( + self, + speaker: str = None, + language: str = None, + speaker_wav: str = None, + emotion: str = None, + speed: float = None, + **kwargs, + ) -> None: + """Check if the arguments are valid for the model.""" + # check for the coqui tts models + if self.is_multi_speaker and (speaker is None and speaker_wav is None): + raise ValueError("Model is multi-speaker but no `speaker` is provided.") + if self.is_multi_lingual and language is None: + raise ValueError("Model is multi-lingual but no `language` is provided.") + if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs: + raise ValueError("Model is not multi-speaker but `speaker` is provided.") + if not self.is_multi_lingual and language is not None: + raise ValueError("Model is not multi-lingual but `language` is provided.") + if not emotion is None and not speed is None: + raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.") + + def tts( + self, + text: str, + speaker: str = None, + language: str = None, + speaker_wav: str = None, + emotion: str = None, + speed: float = None, + split_sentences: bool = True, + **kwargs, + ): + """Convert text to speech. + + Args: + text (str): + Input text to synthesize. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. + language (str): Language of the text. If None, the default language of the speaker is used. 
Language is only + supported by `XTTS` model. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. + emotion (str, optional): + Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None. + speed (float, optional): + Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0. + Defaults to None. + split_sentences (bool, optional): + Split text into sentences, synthesize them separately and concatenate the file audio. + Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only + applicable to the 🐸TTS models. Defaults to True. + kwargs (dict, optional): + Additional arguments for the model. + """ + self._check_arguments( + speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs + ) + wav = self.synthesizer.tts( + text=text, + speaker_name=speaker, + language_name=language, + speaker_wav=speaker_wav, + reference_wav=None, + style_wav=None, + style_text=None, + reference_speaker_name=None, + split_sentences=split_sentences, + **kwargs, + ) + return wav + + def tts_to_file( + self, + text: str, + speaker: str = None, + language: str = None, + speaker_wav: str = None, + emotion: str = None, + speed: float = 1.0, + pipe_out=None, + file_path: str = "output.wav", + split_sentences: bool = True, + **kwargs, + ): + """Convert text to speech. + + Args: + text (str): + Input text to synthesize. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. + language (str, optional): + Language code for multi-lingual models. You can check whether loaded model is multi-lingual + `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. + emotion (str, optional): + Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral". + speed (float, optional): + Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None. + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. + file_path (str, optional): + Output file path. Defaults to "output.wav". + split_sentences (bool, optional): + Split text into sentences, synthesize them separately and concatenate the file audio. + Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only + applicable to the 🐸TTS models. Defaults to True. + kwargs (dict, optional): + Additional arguments for the model. + """ + self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs) + + wav = self.tts( + text=text, + speaker=speaker, + language=language, + speaker_wav=speaker_wav, + split_sentences=split_sentences, + **kwargs, + ) + self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) + return file_path + + def voice_conversion( + self, + source_wav: str, + target_wav: str, + ): + """Voice conversion with FreeVC. Convert source wav to target speaker. + + Args:`` + source_wav (str): + Path to the source wav file. + target_wav (str):` + Path to the target wav file. 
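+
+        Example (rough sketch; the wav paths are placeholders, and saving the result mirrors
+        what `voice_conversion_to_file` does with `save_wav` and the converter's output sample rate):
+            >>> from TTS.utils.audio.numpy_transforms import save_wav
+            >>> tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False)
+            >>> wav = tts.voice_conversion(source_wav="source.wav", target_wav="target.wav")
+            >>> save_wav(wav=wav, path="converted.wav", sample_rate=tts.voice_converter.vc_config.audio.output_sample_rate)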
+ """ + wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) + return wav + + def voice_conversion_to_file( + self, + source_wav: str, + target_wav: str, + file_path: str = "output.wav", + ): + """Voice conversion with FreeVC. Convert source wav to target speaker. + + Args: + source_wav (str): + Path to the source wav file. + target_wav (str): + Path to the target wav file. + file_path (str, optional): + Output file path. Defaults to "output.wav". + """ + wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav) + save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + return file_path + + def tts_with_vc( + self, + text: str, + language: str = None, + speaker_wav: str = None, + speaker: str = None, + split_sentences: bool = True, + ): + """Convert text to speech with voice conversion. + + It combines tts with voice conversion to fake voice cloning. + + - Convert text to speech with tts. + - Convert the output wav to target speaker with voice conversion. + + Args: + text (str): + Input text to synthesize. + language (str, optional): + Language code for multi-lingual models. You can check whether loaded model is multi-lingual + `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. + split_sentences (bool, optional): + Split text into sentences, synthesize them separately and concatenate the file audio. + Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only + applicable to the 🐸TTS models. Defaults to True. + """ + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: + # Lazy code... save it to a temp file to resample it while reading it for VC + self.tts_to_file( + text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences + ) + if self.voice_converter is None: + self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24") + wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav) + return wav + + def tts_with_vc_to_file( + self, + text: str, + language: str = None, + speaker_wav: str = None, + file_path: str = "output.wav", + speaker: str = None, + split_sentences: bool = True, + ): + """Convert text to speech with voice conversion and save to file. + + Check `tts_with_vc` for more details. + + Args: + text (str): + Input text to synthesize. + language (str, optional): + Language code for multi-lingual models. You can check whether loaded model is multi-lingual + `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. + file_path (str, optional): + Output file path. Defaults to "output.wav". + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. 
+ split_sentences (bool, optional): + Split text into sentences, synthesize them separately and concatenate the file audio. + Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only + applicable to the 🐸TTS models. Defaults to True. + """ + wav = self.tts_with_vc( + text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences + ) + save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) diff --git a/TTS/bin/__init__.py b/TTS/bin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py new file mode 100644 index 0000000..662fcd0 --- /dev/null +++ b/TTS/bin/collect_env_info.py @@ -0,0 +1,48 @@ +"""Get detailed info about the working environment.""" +import os +import platform +import sys + +import numpy +import torch + +sys.path += [os.path.abspath(".."), os.path.abspath(".")] +import json + +import TTS + + +def system_info(): + return { + "OS": platform.system(), + "architecture": platform.architecture(), + "version": platform.version(), + "processor": platform.processor(), + "python": platform.python_version(), + } + + +def cuda_info(): + return { + "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], + "available": torch.cuda.is_available(), + "version": torch.version.cuda, + } + + +def package_info(): + return { + "numpy": numpy.__version__, + "PyTorch_version": torch.__version__, + "PyTorch_debug": torch.version.debug, + "TTS": TTS.__version__, + } + + +def main(): + details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()} + print(json.dumps(details, indent=4, sort_keys=True)) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py new file mode 100644 index 0000000..9ab520b --- /dev/null +++ b/TTS/bin/compute_attention_masks.py @@ -0,0 +1,165 @@ +import argparse +import importlib +import os +from argparse import RawTextHelpFormatter + +import numpy as np +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm + +from TTS.config import load_config +from TTS.tts.datasets.TTSDataset import TTSDataset +from TTS.tts.models import setup_model +from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_checkpoint + +if __name__ == "__main__": + # pylint: disable=bad-option-value + parser = argparse.ArgumentParser( + description="""Extract attention masks from trained Tacotron/Tacotron2 models. +These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n""" + """Each attention mask is written to the same path as the input wav file with ".npy" file extension. +(e.g. 
path/bla.wav (wav file) --> path/bla.npy (attention mask))\n""" + """ +Example run: + CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py + --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth + --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json + --dataset_metafile metadata.csv + --data_path /root/LJSpeech-1.1/ + --batch_size 32 + --dataset ljspeech + --use_cuda True +""", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ") + parser.add_argument( + "--config_path", + type=str, + required=True, + help="Path to Tacotron/Tacotron2 config file.", + ) + parser.add_argument( + "--dataset", + type=str, + default="", + required=True, + help="Target dataset processor name from TTS.tts.dataset.preprocess.", + ) + + parser.add_argument( + "--dataset_metafile", + type=str, + default="", + required=True, + help="Dataset metafile inclusing file paths with transcripts.", + ) + parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.") + parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.") + + parser.add_argument( + "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA." + ) + args = parser.parse_args() + + C = load_config(args.config_path) + ap = AudioProcessor(**C.audio) + + # if the vocabulary was passed, replace the default + if "characters" in C.keys(): + symbols, phonemes = make_symbols(**C.characters) + + # load the model + num_chars = len(phonemes) if C.use_phonemes else len(symbols) + # TODO: handle multi-speaker + model = setup_model(C) + model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True) + + # data loader + preprocessor = importlib.import_module("TTS.tts.datasets.formatters") + preprocessor = getattr(preprocessor, args.dataset) + meta_data = preprocessor(args.data_path, args.dataset_metafile) + dataset = TTSDataset( + model.decoder.r, + C.text_cleaner, + compute_linear_spec=False, + ap=ap, + meta_data=meta_data, + characters=C.characters if "characters" in C.keys() else None, + add_blank=C["add_blank"] if "add_blank" in C.keys() else False, + use_phonemes=C.use_phonemes, + phoneme_cache_path=C.phoneme_cache_path, + phoneme_language=C.phoneme_language, + enable_eos_bos=C.enable_eos_bos_chars, + ) + + dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False)) + loader = DataLoader( + dataset, + batch_size=args.batch_size, + num_workers=4, + collate_fn=dataset.collate_fn, + shuffle=False, + drop_last=False, + ) + + # compute attentions + file_paths = [] + with torch.no_grad(): + for data in tqdm(loader): + # setup input data + text_input = data[0] + text_lengths = data[1] + linear_input = data[3] + mel_input = data[4] + mel_lengths = data[5] + stop_targets = data[6] + item_idxs = data[7] + + # dispatch data to GPU + if args.use_cuda: + text_input = text_input.cuda() + text_lengths = text_lengths.cuda() + mel_input = mel_input.cuda() + mel_lengths = mel_lengths.cuda() + + model_outputs = model.forward(text_input, text_lengths, mel_input) + + alignments = model_outputs["alignments"].detach() + for idx, alignment in enumerate(alignments): + item_idx = item_idxs[idx] + # interpolate if r > 1 + alignment = ( + torch.nn.functional.interpolate( + alignment.transpose(0, 1).unsqueeze(0), + size=None, + 
scale_factor=model.decoder.r, + mode="nearest", + align_corners=None, + recompute_scale_factor=None, + ) + .squeeze(0) + .transpose(0, 1) + ) + # remove paddings + alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy() + # set file paths + wav_file_name = os.path.basename(item_idx) + align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy" + file_path = item_idx.replace(wav_file_name, align_file_name) + # save output + wav_file_abs_path = os.path.abspath(item_idx) + file_abs_path = os.path.abspath(file_path) + file_paths.append([wav_file_abs_path, file_abs_path]) + np.save(file_path, alignment) + + # ourput metafile + metafile = os.path.join(args.data_path, "metadata_attn_mask.txt") + + with open(metafile, "w", encoding="utf-8") as f: + for p in file_paths: + f.write(f"{p[0]}|{p[1]}\n") + print(f" >> Metafile created: {metafile}") diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py new file mode 100644 index 0000000..5b5a37d --- /dev/null +++ b/TTS/bin/compute_embeddings.py @@ -0,0 +1,197 @@ +import argparse +import os +from argparse import RawTextHelpFormatter + +import torch +from tqdm import tqdm + +from TTS.config import load_config +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.managers import save_file +from TTS.tts.utils.speakers import SpeakerManager + + +def compute_embeddings( + model_path, + config_path, + output_path, + old_speakers_file=None, + old_append=False, + config_dataset_path=None, + formatter_name=None, + dataset_name=None, + dataset_path=None, + meta_file_train=None, + meta_file_val=None, + disable_cuda=False, + no_eval=False, +): + use_cuda = torch.cuda.is_available() and not disable_cuda + + if config_dataset_path is not None: + c_dataset = load_config(config_dataset_path) + meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval) + else: + c_dataset = BaseDatasetConfig() + c_dataset.formatter = formatter_name + c_dataset.dataset_name = dataset_name + c_dataset.path = dataset_path + if meta_file_train is not None: + c_dataset.meta_file_train = meta_file_train + if meta_file_val is not None: + c_dataset.meta_file_val = meta_file_val + meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval) + + if meta_data_eval is None: + samples = meta_data_train + else: + samples = meta_data_train + meta_data_eval + + encoder_manager = SpeakerManager( + encoder_model_path=model_path, + encoder_config_path=config_path, + d_vectors_file_path=old_speakers_file, + use_cuda=use_cuda, + ) + + class_name_key = encoder_manager.encoder_config.class_name_key + + # compute speaker embeddings + if old_speakers_file is not None and old_append: + speaker_mapping = encoder_manager.embeddings + else: + speaker_mapping = {} + + for fields in tqdm(samples): + class_name = fields[class_name_key] + audio_file = fields["audio_file"] + embedding_key = fields["audio_unique_name"] + + # Only update the speaker name when the embedding is already in the old file. 
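+        # (Illustrative note: each mapping entry built below has the form
+        #  {"name": <class/speaker name>, "embedding": <embedding vector>}, keyed by the
+        #  sample's `audio_unique_name`; the whole dict is finally written with `save_file`,
+        #  e.g. as `speakers.pth`.)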
+ if embedding_key in speaker_mapping: + speaker_mapping[embedding_key]["name"] = class_name + continue + + if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids: + # get the embedding from the old file + embedd = encoder_manager.get_embedding_by_clip(embedding_key) + else: + # extract the embedding + embedd = encoder_manager.compute_embedding_from_clip(audio_file) + + # create speaker_mapping if target dataset is defined + speaker_mapping[embedding_key] = {} + speaker_mapping[embedding_key]["name"] = class_name + speaker_mapping[embedding_key]["embedding"] = embedd + + if speaker_mapping: + # save speaker_mapping if target dataset is defined + if os.path.isdir(output_path): + mapping_file_path = os.path.join(output_path, "speakers.pth") + else: + mapping_file_path = output_path + + if os.path.dirname(mapping_file_path) != "": + os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True) + + save_file(speaker_mapping, mapping_file_path) + print("Speaker embeddings saved at:", mapping_file_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" + """ + Example runs: + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json + + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument( + "--model_path", + type=str, + help="Path to model checkpoint file. It defaults to the released speaker encoder.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to model config file. It defaults to the released speaker encoder config.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", + ) + parser.add_argument( + "--config_dataset_path", + type=str, + help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", + default=None, + ) + parser.add_argument( + "--output_path", + type=str, + help="Path for output `pth` or `json` file.", + default="speakers.pth", + ) + parser.add_argument( + "--old_file", + type=str, + help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", + default=None, + ) + parser.add_argument( + "--old_append", + help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", + default=False, + action="store_true", + ) + parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False) + parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") + parser.add_argument( + "--formatter_name", + type=str, + help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_name", + type=str, + help="Name of the dataset to use. 
You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_path", + type=str, + help="Path to the dataset. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_train", + type=str, + help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_val", + type=str, + help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + args = parser.parse_args() + + compute_embeddings( + args.model_path, + args.config_path, + args.output_path, + old_speakers_file=args.old_file, + old_append=args.old_append, + config_dataset_path=args.config_dataset_path, + formatter_name=args.formatter_name, + dataset_name=args.dataset_name, + dataset_path=args.dataset_path, + meta_file_train=args.meta_file_train, + meta_file_val=args.meta_file_val, + disable_cuda=args.disable_cuda, + no_eval=args.no_eval, + ) diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py new file mode 100644 index 0000000..3ab7ea7 --- /dev/null +++ b/TTS/bin/compute_statistics.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import glob +import os + +import numpy as np +from tqdm import tqdm + +# from TTS.utils.io import load_config +from TTS.config import load_config +from TTS.tts.datasets import load_tts_samples +from TTS.utils.audio import AudioProcessor + + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") + parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") + parser.add_argument("out_path", type=str, help="save path (directory and filename).") + parser.add_argument( + "--data_path", + type=str, + required=False, + help="folder including the target set of wavs overriding dataset config.", + ) + args, overrides = parser.parse_known_args() + + CONFIG = load_config(args.config_path) + CONFIG.parse_known_args(overrides, relaxed_parser=True) + + # load config + CONFIG.audio.signal_norm = False # do not apply earlier normalization + CONFIG.audio.stats_path = None # discard pre-defined stats + + # load audio processor + ap = AudioProcessor(**CONFIG.audio.to_dict()) + + # load the meta data of target dataset + if args.data_path: + dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True) + else: + dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data + print(f" > There are {len(dataset_items)} files.") + + mel_sum = 0 + mel_square_sum = 0 + linear_sum = 0 + linear_square_sum = 0 + N = 0 + for item in tqdm(dataset_items): + # compute features + wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"]) + linear = ap.spectrogram(wav) + mel = ap.melspectrogram(wav) + + # compute stats + N += mel.shape[1] + mel_sum += mel.sum(1) + linear_sum += linear.sum(1) + mel_square_sum += (mel**2).sum(axis=1) + linear_square_sum += (linear**2).sum(axis=1) + + mel_mean = mel_sum / N + mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2) + linear_mean = linear_sum / N + linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2) + + 
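+    # The per-bin means/scales computed above are packed into a dict and written with
+    # np.save below. Illustrative note: a stats file saved this way can be read back with
+    # np.load(out_path, allow_pickle=True).item().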
output_file_path = args.out_path + stats = {} + stats["mel_mean"] = mel_mean + stats["mel_std"] = mel_scale + stats["linear_mean"] = linear_mean + stats["linear_std"] = linear_scale + + print(f" > Avg mel spec mean: {mel_mean.mean()}") + print(f" > Avg mel spec scale: {mel_scale.mean()}") + print(f" > Avg linear spec mean: {linear_mean.mean()}") + print(f" > Avg linear spec scale: {linear_scale.mean()}") + + # set default config values for mean-var scaling + CONFIG.audio.stats_path = output_file_path + CONFIG.audio.signal_norm = True + # remove redundant values + del CONFIG.audio.max_norm + del CONFIG.audio.min_level_db + del CONFIG.audio.symmetric_norm + del CONFIG.audio.clip_norm + stats["audio_config"] = CONFIG.audio.to_dict() + np.save(output_file_path, stats, allow_pickle=True) + print(f" > stats saved to {output_file_path}") + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py new file mode 100644 index 0000000..60fed13 --- /dev/null +++ b/TTS/bin/eval_encoder.py @@ -0,0 +1,88 @@ +import argparse +from argparse import RawTextHelpFormatter + +import torch +from tqdm import tqdm + +from TTS.config import load_config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.speakers import SpeakerManager + + +def compute_encoder_accuracy(dataset_items, encoder_manager): + class_name_key = encoder_manager.encoder_config.class_name_key + map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None) + + class_acc_dict = {} + + # compute embeddings for all wav_files + for item in tqdm(dataset_items): + class_name = item[class_name_key] + wav_file = item["audio_file"] + + # extract the embedding + embedd = encoder_manager.compute_embedding_from_clip(wav_file) + if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None: + embedding = torch.FloatTensor(embedd).unsqueeze(0) + if encoder_manager.use_cuda: + embedding = embedding.cuda() + + class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item() + predicted_label = map_classid_to_classname[str(class_id)] + else: + predicted_label = None + + if class_name is not None and predicted_label is not None: + is_equal = int(class_name == predicted_label) + if class_name not in class_acc_dict: + class_acc_dict[class_name] = [is_equal] + else: + class_acc_dict[class_name].append(is_equal) + else: + raise RuntimeError("Error: class_name or/and predicted_label are None") + + acc_avg = 0 + for key, values in class_acc_dict.items(): + acc = sum(values) / len(values) + print("Class", key, "Accuracy:", acc) + acc_avg += acc + + print("Average Accuracy:", acc_avg / len(class_acc_dict)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Compute the accuracy of the encoder.\n\n""" + """ + Example runs: + python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") + parser.add_argument( + "config_path", + type=str, + help="Path to model config file.", + ) + + parser.add_argument( + "config_dataset_path", + type=str, + help="Path to dataset config file.", + ) + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + + args = parser.parse_args() + + c_dataset = 
load_config(args.config_dataset_path) + + meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) + items = meta_data_train + meta_data_eval + + enc_manager = SpeakerManager( + encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda + ) + + compute_encoder_accuracy(items, enc_manager) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py new file mode 100644 index 0000000..c604862 --- /dev/null +++ b/TTS/bin/extract_tts_spectrograms.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +"""Extract Mel spectrograms with teacher forcing.""" + +import argparse +import os + +import numpy as np +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm + +from TTS.config import load_config +from TTS.tts.datasets import TTSDataset, load_tts_samples +from TTS.tts.models import setup_model +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer +from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.numpy_transforms import quantize +from TTS.utils.generic_utils import count_parameters + +use_cuda = torch.cuda.is_available() + + +def setup_loader(ap, r, verbose=False): + tokenizer, _ = TTSTokenizer.init_from_config(c) + dataset = TTSDataset( + outputs_per_step=r, + compute_linear_spec=False, + samples=meta_data, + tokenizer=tokenizer, + ap=ap, + batch_group_size=0, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, + phoneme_cache_path=c.phoneme_cache_path, + precompute_num_workers=0, + use_noise_augment=False, + verbose=verbose, + speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, + ) + + if c.use_phonemes and c.compute_input_seq_cache: + # precompute phonemes to have a better estimate of sequence lengths. 
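+            # (passes `c.num_loader_workers` so the phoneme pre-computation can run in
+            #  multiple loader workers before `preprocess_samples` is called)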
+ dataset.compute_input_seq(c.num_loader_workers) + dataset.preprocess_samples() + + loader = DataLoader( + dataset, + batch_size=c.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=None, + num_workers=c.num_loader_workers, + pin_memory=False, + ) + return loader + + +def set_filename(wav_path, out_path): + wav_file = os.path.basename(wav_path) + file_name = wav_file.split(".")[0] + os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) + os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) + os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True) + os.makedirs(os.path.join(out_path, "wav"), exist_ok=True) + wavq_path = os.path.join(out_path, "quant", file_name) + mel_path = os.path.join(out_path, "mel", file_name) + wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav") + wav_path = os.path.join(out_path, "wav", file_name + ".wav") + return file_name, wavq_path, mel_path, wav_gl_path, wav_path + + +def format_data(data): + # setup input data + text_input = data["token_id"] + text_lengths = data["token_id_lengths"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + item_idx = data["item_idxs"] + d_vectors = data["d_vectors"] + speaker_ids = data["speaker_ids"] + attn_mask = data["attns"] + avg_text_length = torch.mean(text_lengths.float()) + avg_spec_length = torch.mean(mel_lengths.float()) + + # dispatch data to GPU + if use_cuda: + text_input = text_input.cuda(non_blocking=True) + text_lengths = text_lengths.cuda(non_blocking=True) + mel_input = mel_input.cuda(non_blocking=True) + mel_lengths = mel_lengths.cuda(non_blocking=True) + if speaker_ids is not None: + speaker_ids = speaker_ids.cuda(non_blocking=True) + if d_vectors is not None: + d_vectors = d_vectors.cuda(non_blocking=True) + if attn_mask is not None: + attn_mask = attn_mask.cuda(non_blocking=True) + return ( + text_input, + text_lengths, + mel_input, + mel_lengths, + speaker_ids, + d_vectors, + avg_text_length, + avg_spec_length, + attn_mask, + item_idx, + ) + + +@torch.no_grad() +def inference( + model_name, + model, + ap, + text_input, + text_lengths, + mel_input, + mel_lengths, + speaker_ids=None, + d_vectors=None, +): + if model_name == "glow_tts": + speaker_c = None + if speaker_ids is not None: + speaker_c = speaker_ids + elif d_vectors is not None: + speaker_c = d_vectors + outputs = model.inference_with_MAS( + text_input, + text_lengths, + mel_input, + mel_lengths, + aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, + ) + model_output = outputs["model_outputs"] + model_output = model_output.detach().cpu().numpy() + + elif "tacotron" in model_name: + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) + postnet_outputs = outputs["model_outputs"] + # normalize tacotron output + if model_name == "tacotron": + mel_specs = [] + postnet_outputs = postnet_outputs.data.cpu().numpy() + for b in range(postnet_outputs.shape[0]): + postnet_output = postnet_outputs[b] + mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T)) + model_output = torch.stack(mel_specs).cpu().numpy() + + elif model_name == "tacotron2": + model_output = postnet_outputs.detach().cpu().numpy() + return model_output + + +def extract_spectrograms( + data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt" +): + model.eval() + export_metadata = [] + for _, data in tqdm(enumerate(data_loader), 
total=len(data_loader)): + # format data + ( + text_input, + text_lengths, + mel_input, + mel_lengths, + speaker_ids, + d_vectors, + _, + _, + _, + item_idx, + ) = format_data(data) + + model_output = inference( + c.model.lower(), + model, + ap, + text_input, + text_lengths, + mel_input, + mel_lengths, + speaker_ids, + d_vectors, + ) + + for idx in range(text_input.shape[0]): + wav_file_path = item_idx[idx] + wav = ap.load_wav(wav_file_path) + _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) + + # quantize and save wav + if quantize_bits > 0: + wavq = quantize(wav, quantize_bits) + np.save(wavq_path, wavq) + + # save TTS mel + mel = model_output[idx] + mel_length = mel_lengths[idx] + mel = mel[:mel_length, :].T + np.save(mel_path, mel) + + export_metadata.append([wav_file_path, mel_path]) + if save_audio: + ap.save_wav(wav, wav_path) + + if debug: + print("Audio for debug saved at:", wav_gl_path) + wav = ap.inv_melspectrogram(mel) + ap.save_wav(wav, wav_gl_path) + + with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f: + for data in export_metadata: + f.write(f"{data[0]}|{data[1]+'.npy'}\n") + + +def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined + global meta_data, speaker_manager + + # Audio processor + ap = AudioProcessor(**c.audio) + + # load data instances + meta_data_train, meta_data_eval = load_tts_samples( + c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + ) + + # use eval and training partitions + meta_data = meta_data_train + meta_data_eval + + # init speaker manager + if c.use_speaker_embedding: + speaker_manager = SpeakerManager(data_items=meta_data) + elif c.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + else: + speaker_manager = None + + # setup model + model = setup_model(c) + + # restore model + model.load_checkpoint(c, args.checkpoint_path, eval=True) + + if use_cuda: + model.cuda() + + num_params = count_parameters(model) + print("\n > Model has {} parameters".format(num_params), flush=True) + # set r + r = 1 if c.model.lower() == "glow_tts" else model.decoder.r + own_loader = setup_loader(ap, r, verbose=True) + + extract_spectrograms( + own_loader, + model, + ap, + args.output_path, + quantize_bits=args.quantize_bits, + save_audio=args.save_audio, + debug=args.debug, + metada_name="metada.txt", + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) + parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) + parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) + parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") + parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + args = parser.parse_args() + + c = load_config(args.config_path) + c.audio.trim_silence = False + main(args) diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py new file mode 100644 index 0000000..ea16974 --- /dev/null +++ b/TTS/bin/find_unique_chars.py @@ -0,0 +1,45 @@ +"""Find all 
the unique characters in a dataset""" +import argparse +from argparse import RawTextHelpFormatter + +from TTS.config import load_config +from TTS.tts.datasets import load_tts_samples + + +def main(): + # pylint: disable=bad-option-value + parser = argparse.ArgumentParser( + description="""Find all the unique characters or phonemes in a dataset.\n\n""" + """ + Example runs: + + python TTS/bin/find_unique_chars.py --config_path config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) + args = parser.parse_args() + + c = load_config(args.config_path) + + # load all datasets + train_items, eval_items = load_tts_samples( + c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + ) + + items = train_items + eval_items + + texts = "".join(item["text"] for item in items) + chars = set(texts) + lower_chars = filter(lambda c: c.islower(), chars) + chars_force_lower = [c.lower() for c in chars] + chars_force_lower = set(chars_force_lower) + + print(f" > Number of unique characters: {len(chars)}") + print(f" > Unique characters: {''.join(sorted(chars))}") + print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") + print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py new file mode 100644 index 0000000..4bd7a78 --- /dev/null +++ b/TTS/bin/find_unique_phonemes.py @@ -0,0 +1,74 @@ +"""Find all the unique characters in a dataset""" +import argparse +import multiprocessing +from argparse import RawTextHelpFormatter + +from tqdm.contrib.concurrent import process_map + +from TTS.config import load_config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.text.phonemizers import Gruut + + +def compute_phonemes(item): + text = item["text"] + ph = phonemizer.phonemize(text).replace("|", "") + return set(list(ph)) + + +def main(): + # pylint: disable=W0601 + global c, phonemizer + # pylint: disable=bad-option-value + parser = argparse.ArgumentParser( + description="""Find all the unique characters or phonemes in a dataset.\n\n""" + """ + Example runs: + + python TTS/bin/find_unique_phonemes.py --config_path config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) + args = parser.parse_args() + + c = load_config(args.config_path) + + # load all datasets + train_items, eval_items = load_tts_samples( + c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + ) + items = train_items + eval_items + print("Num items:", len(items)) + + language_list = [item["language"] for item in items] + is_lang_def = all(language_list) + + if not c.phoneme_language or not is_lang_def: + raise ValueError("Phoneme language must be defined in config.") + + if not language_list.count(language_list[0]) == len(language_list): + raise ValueError( + "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" 
+ ) + + phonemizer = Gruut(language=language_list[0], keep_puncs=True) + + phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) + phones = [] + for ph in phonemes: + phones.extend(ph) + + phones = set(phones) + lower_phones = filter(lambda c: c.islower(), phones) + phones_force_lower = [c.lower() for c in phones] + phones_force_lower = set(phones_force_lower) + + print(f" > Number of unique phonemes: {len(phones)}") + print(f" > Unique phonemes: {''.join(sorted(phones))}") + print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") + print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py new file mode 100644 index 0000000..a1eaf4c --- /dev/null +++ b/TTS/bin/remove_silence_using_vad.py @@ -0,0 +1,124 @@ +import argparse +import glob +import multiprocessing +import os +import pathlib + +import torch +from tqdm import tqdm + +from TTS.utils.vad import get_vad_model_and_utils, remove_silence + +torch.set_num_threads(1) + + +def adjust_path_and_remove_silence(audio_path): + output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) + # ignore if the file exists + if os.path.exists(output_path) and not args.force: + return output_path, False + + # create all directory structure + pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) + # remove the silence and save the audio + output_path, is_speech = remove_silence( + model_and_utils, + audio_path, + output_path, + trim_just_beginning_and_end=args.trim_just_beginning_and_end, + use_cuda=args.use_cuda, + ) + return output_path, is_speech + + +def preprocess_audios(): + files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True)) + print("> Number of files: ", len(files)) + if not args.force: + print("> Ignoring files that already exist in the output idrectory.") + + if args.trim_just_beginning_and_end: + print("> Trimming just the beginning and the end with nonspeech parts.") + else: + print("> Trimming all nonspeech parts.") + + filtered_files = [] + if files: + # create threads + # num_threads = multiprocessing.cpu_count() + # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15) + + if args.num_processes > 1: + with multiprocessing.Pool(processes=args.num_processes) as pool: + results = list( + tqdm( + pool.imap_unordered(adjust_path_and_remove_silence, files), + total=len(files), + desc="Processing audio files", + ) + ) + for output_path, is_speech in results: + if not is_speech: + filtered_files.append(output_path) + else: + for f in tqdm(files): + output_path, is_speech = adjust_path_and_remove_silence(f) + if not is_speech: + filtered_files.append(output_path) + + # write files that do not have speech + with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f: + for file in filtered_files: + f.write(str(file) + "\n") + else: + print("> No files Found !") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True" + ) + parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True) + parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", 
default="") + parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files") + parser.add_argument( + "-g", + "--glob", + type=str, + default="**/*.wav", + help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav", + ) + parser.add_argument( + "-t", + "--trim_just_beginning_and_end", + type=bool, + default=True, + help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True", + ) + parser.add_argument( + "-c", + "--use_cuda", + type=bool, + default=False, + help="If True use cuda", + ) + parser.add_argument( + "--use_onnx", + type=bool, + default=False, + help="If True use onnx", + ) + parser.add_argument( + "--num_processes", + type=int, + default=1, + help="Number of processes to use", + ) + args = parser.parse_args() + + if args.output_dir == "": + args.output_dir = args.input_dir + + # load the model and utils + model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx) + preprocess_audios() diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py new file mode 100644 index 0000000..a3f2848 --- /dev/null +++ b/TTS/bin/resample.py @@ -0,0 +1,90 @@ +import argparse +import glob +import os +from argparse import RawTextHelpFormatter +from multiprocessing import Pool +from shutil import copytree + +import librosa +import soundfile as sf +from tqdm import tqdm + + +def resample_file(func_args): + filename, output_sr = func_args + y, sr = librosa.load(filename, sr=output_sr) + sf.write(filename, y, sr) + + +def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10): + if output_dir: + print("Recursively copying the input folder...") + copytree(input_dir, output_dir) + input_dir = output_dir + + print("Resampling the audio files...") + audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True) + print(f"Found {len(audio_files)} files...") + audio_files = list(zip(audio_files, len(audio_files) * [output_sr])) + with Pool(processes=n_jobs) as p: + with tqdm(total=len(audio_files)) as pbar: + for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)): + pbar.update() + + print("Done !") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Resample a folder recusively with librosa + Can be used in place or create a copy of the folder as an output.\n\n + Example run: + python TTS/bin/resample.py + --input_dir /root/LJSpeech-1.1/ + --output_sr 22050 + --output_dir /root/resampled_LJSpeech-1.1/ + --file_ext wav + --n_jobs 24 + """, + formatter_class=RawTextHelpFormatter, + ) + + parser.add_argument( + "--input_dir", + type=str, + default=None, + required=True, + help="Path of the folder containing the audio files to resample", + ) + + parser.add_argument( + "--output_sr", + type=int, + default=22050, + required=False, + help="Samlple rate to which the audio files should be resampled", + ) + + parser.add_argument( + "--output_dir", + type=str, + default=None, + required=False, + help="Path of the destination folder. 
If not defined, the operation is done in place", + ) + + parser.add_argument( + "--file_ext", + type=str, + default="wav", + required=False, + help="Extension of the audio files to resample", + ) + + parser.add_argument( + "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores" + ) + + args = parser.parse_args() + + resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py new file mode 100644 index 0000000..b86252a --- /dev/null +++ b/TTS/bin/synthesize.py @@ -0,0 +1,494 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import contextlib +import sys +from argparse import RawTextHelpFormatter + +# pylint: disable=redefined-outer-name, unused-argument +from pathlib import Path + +description = """ +Synthesize speech on command line. + +You can either use your trained model or choose a model from the provided list. + +If you don't specify any models, then it uses LJSpeech based English model. + +#### Single Speaker Models + +- List provided models: + + ``` + $ tts --list_models + ``` + +- Get model info (for both tts_models and vocoder_models): + + - Query by type/name: + The model_info_by_name uses the name as it from the --list_models. + ``` + $ tts --model_info_by_name "///" + ``` + For example: + ``` + $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts + $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` + - Query by type/idx: + The model_query_idx uses the corresponding idx from --list_models. + + ``` + $ tts --model_info_by_idx "/" + ``` + + For example: + + ``` + $ tts --model_info_by_idx tts_models/3 + ``` + + - Query info for model info by full name: + ``` + $ tts --model_info_by_name "///" + ``` + +- Run TTS with default models: + + ``` + $ tts --text "Text for TTS" --out_path output/path/speech.wav + ``` + +- Run TTS and pipe out the generated TTS wav file data: + + ``` + $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ``` + +- Run a TTS model with its default vocoder model: + + ``` + $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ``` + + For example: + + ``` + $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav + ``` + +- Run with specific TTS and vocoder models from the list: + + ``` + $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav + ``` + + For example: + + ``` + $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ``` + +- Run your own TTS model (Using Griffin-Lim Vocoder): + + ``` + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + ``` + +- Run your own TTS and Vocoder models: + + ``` + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json + ``` + +#### Multi-speaker Models + +- List the available speakers and choose a among them: + + ``` + $ tts --model_name "//" --list_speaker_idxs + ``` + +- Run the multi-speaker TTS model with the target speaker ID: + + ``` + $ tts --text "Text for TTS." 
--out_path output/path/speech.wav --model_name "//" --speaker_idx + ``` + +- Run your own multi-speaker TTS model: + + ``` + $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx + ``` + +### Voice Conversion Models + +``` +$ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav +``` +""" + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + if v.lower() in ("no", "false", "f", "n", "0"): + return False + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def main(): + parser = argparse.ArgumentParser( + description=description.replace(" ```\n", ""), + formatter_class=RawTextHelpFormatter, + ) + + parser.add_argument( + "--list_models", + type=str2bool, + nargs="?", + const=True, + default=False, + help="list available pre-trained TTS and vocoder models.", + ) + + parser.add_argument( + "--model_info_by_idx", + type=str, + default=None, + help="model info using query format: /", + ) + + parser.add_argument( + "--model_info_by_name", + type=str, + default=None, + help="model info using query format: ///", + ) + + parser.add_argument("--text", type=str, default=None, help="Text to generate speech.") + + # Args for running pre-trained TTS models. + parser.add_argument( + "--model_name", + type=str, + default="tts_models/en/ljspeech/tacotron2-DDC", + help="Name of one of the pre-trained TTS models in format //", + ) + parser.add_argument( + "--vocoder_name", + type=str, + default=None, + help="Name of one of the pre-trained vocoder models in format //", + ) + + # Args for running custom models + parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.") + parser.add_argument( + "--model_path", + type=str, + default=None, + help="Path to model file.", + ) + parser.add_argument( + "--out_path", + type=str, + default="tts_output.wav", + help="Output wav file path.", + ) + parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False) + parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") + parser.add_argument( + "--vocoder_path", + type=str, + help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. 
Please make sure that you installed vocoder library before (WaveRNN).", + default=None, + ) + parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) + parser.add_argument( + "--encoder_path", + type=str, + help="Path to speaker encoder model file.", + default=None, + ) + parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) + parser.add_argument( + "--pipe_out", + help="stdout the generated TTS wav file for shell pipe.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) + + # args for multi-speaker synthesis + parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) + parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) + parser.add_argument( + "--speaker_idx", + type=str, + help="Target speaker ID for a multi-speaker TTS model.", + default=None, + ) + parser.add_argument( + "--language_idx", + type=str, + help="Target language ID for a multi-lingual TTS model.", + default=None, + ) + parser.add_argument( + "--speaker_wav", + nargs="+", + help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.", + default=None, + ) + parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None) + parser.add_argument( + "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None + ) + parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None) + parser.add_argument( + "--list_speaker_idxs", + help="List available speaker ids for the defined multi-speaker model.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) + parser.add_argument( + "--list_language_idxs", + help="List available language ids for the defined multi-lingual model.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) + # aux args + parser.add_argument( + "--save_spectogram", + type=bool, + help="If true save raw spectogram for further (vocoder) processing in out_path.", + default=False, + ) + parser.add_argument( + "--reference_wav", + type=str, + help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav", + default=None, + ) + parser.add_argument( + "--reference_speaker_idx", + type=str, + help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).", + default=None, + ) + parser.add_argument( + "--progress_bar", + type=str2bool, + help="If true shows a progress bar for the model download. 
Defaults to True", + default=True, + ) + + # voice conversion args + parser.add_argument( + "--source_wav", + type=str, + default=None, + help="Original audio file to convert in the voice of the target_wav", + ) + parser.add_argument( + "--target_wav", + type=str, + default=None, + help="Target audio file to convert in the voice of the source_wav", + ) + + parser.add_argument( + "--voice_dir", + type=str, + default=None, + help="Voice dir for tortoise model", + ) + + args = parser.parse_args() + + # print the description if either text or list_models is not set + check_args = [ + args.text, + args.list_models, + args.list_speaker_idxs, + args.list_language_idxs, + args.reference_wav, + args.model_info_by_idx, + args.model_info_by_name, + args.source_wav, + args.target_wav, + ] + if not any(check_args): + parser.parse_args(["-h"]) + + pipe_out = sys.stdout if args.pipe_out else None + + with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): + # Late-import to make things load faster + from TTS.api import TTS + from TTS.utils.manage import ModelManager + from TTS.utils.synthesizer import Synthesizer + + # load model manager + path = Path(__file__).parent / "../.models.json" + manager = ModelManager(path, progress_bar=args.progress_bar) + api = TTS() + + tts_path = None + tts_config_path = None + speakers_file_path = None + language_ids_file_path = None + vocoder_path = None + vocoder_config_path = None + encoder_path = None + encoder_config_path = None + vc_path = None + vc_config_path = None + model_dir = None + + # CASE1 #list : list pre-trained TTS models + if args.list_models: + manager.list_models() + sys.exit() + + # CASE2 #info : model info for pre-trained TTS models + if args.model_info_by_idx: + model_query = args.model_info_by_idx + manager.model_info_by_idx(model_query) + sys.exit() + + if args.model_info_by_name: + model_query_full_name = args.model_info_by_name + manager.model_info_by_full_name(model_query_full_name) + sys.exit() + + # CASE3: load pre-trained model paths + if args.model_name is not None and not args.model_path: + model_path, config_path, model_item = manager.download_model(args.model_name) + # tts model + if model_item["model_type"] == "tts_models": + tts_path = model_path + tts_config_path = config_path + if "default_vocoder" in model_item: + args.vocoder_name = ( + model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name + ) + + # voice conversion model + if model_item["model_type"] == "voice_conversion_models": + vc_path = model_path + vc_config_path = config_path + + # tts model with multiple files to be loaded from the directory path + if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): + model_dir = model_path + tts_path = None + tts_config_path = None + args.vocoder_name = None + + # load vocoder + if args.vocoder_name is not None and not args.vocoder_path: + vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) + + # CASE4: set custom model paths + if args.model_path is not None: + tts_path = args.model_path + tts_config_path = args.config_path + speakers_file_path = args.speakers_file_path + language_ids_file_path = args.language_ids_file_path + + if args.vocoder_path is not None: + vocoder_path = args.vocoder_path + vocoder_config_path = args.vocoder_config_path + + if args.encoder_path is not None: + encoder_path = args.encoder_path + encoder_config_path = args.encoder_config_path + + device = args.device + if args.use_cuda: + device = "cuda" 
+ + # load models + synthesizer = Synthesizer( + tts_path, + tts_config_path, + speakers_file_path, + language_ids_file_path, + vocoder_path, + vocoder_config_path, + encoder_path, + encoder_config_path, + vc_path, + vc_config_path, + model_dir, + args.voice_dir, + ).to(device) + + # query speaker ids of a multi-speaker model. + if args.list_speaker_idxs: + print( + " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." + ) + print(synthesizer.tts_model.speaker_manager.name_to_id) + return + + # query langauge ids of a multi-lingual model. + if args.list_language_idxs: + print( + " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." + ) + print(synthesizer.tts_model.language_manager.name_to_id) + return + + # check the arguments against a multi-speaker model. + if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + print( + " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " + "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." + ) + return + + # RUN THE SYNTHESIS + if args.text: + print(" > Text: {}".format(args.text)) + + # kick it + if tts_path is not None: + wav = synthesizer.tts( + args.text, + speaker_name=args.speaker_idx, + language_name=args.language_idx, + speaker_wav=args.speaker_wav, + reference_wav=args.reference_wav, + style_wav=args.capacitron_style_wav, + style_text=args.capacitron_style_text, + reference_speaker_name=args.reference_speaker_idx, + ) + elif vc_path is not None: + wav = synthesizer.voice_conversion( + source_wav=args.source_wav, + target_wav=args.target_wav, + ) + elif model_dir is not None: + wav = synthesizer.tts( + args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav + ) + + # save the results + print(" > Saving output to {}".format(args.out_path)) + synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py new file mode 100644 index 0000000..a32ad00 --- /dev/null +++ b/TTS/bin/train_encoder.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import sys +import time +import traceback + +import torch +from torch.utils.data import DataLoader +from trainer.io import copy_model_files, save_best_model, save_checkpoint +from trainer.torch import NoamLR +from trainer.trainer_utils import get_optimizer + +from TTS.encoder.dataset import EncoderDataset +from TTS.encoder.utils.generic_utils import setup_encoder_model +from TTS.encoder.utils.training import init_training +from TTS.encoder.utils.visual import plot_embeddings +from TTS.tts.datasets import load_tts_samples +from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import count_parameters, remove_experiment_folder +from TTS.utils.samplers import PerfectBatchSampler +from TTS.utils.training import check_update + +torch.backends.cudnn.enabled = True +torch.backends.cudnn.benchmark = True +torch.manual_seed(54321) +use_cuda = torch.cuda.is_available() +num_gpus = torch.cuda.device_count() +print(" > Using CUDA: ", use_cuda) +print(" > Number of GPUs: ", num_gpus) + + +def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False): + num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class + num_classes_in_batch = c.num_classes_in_batch if 
not is_val else c.eval_num_classes_in_batch + + dataset = EncoderDataset( + c, + ap, + meta_data_eval if is_val else meta_data_train, + voice_len=c.voice_len, + num_utter_per_class=num_utter_per_class, + num_classes_in_batch=num_classes_in_batch, + verbose=verbose, + augmentation_config=c.audio_augmentation if not is_val else None, + use_torch_spec=c.model_params.get("use_torch_spec", False), + ) + # get classes list + classes = dataset.get_class_list() + + sampler = PerfectBatchSampler( + dataset.items, + classes, + batch_size=num_classes_in_batch * num_utter_per_class, # total batch size + num_classes_in_batch=num_classes_in_batch, + num_gpus=1, + shuffle=not is_val, + drop_last=True, + ) + + if len(classes) < num_classes_in_batch: + if is_val: + raise RuntimeError( + f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !" + ) + raise RuntimeError( + f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !" + ) + + # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal + if is_val: + dataset.set_classes(train_classes) + + loader = DataLoader( + dataset, + num_workers=c.num_loader_workers, + batch_sampler=sampler, + collate_fn=dataset.collate_fn, + ) + + return loader, classes, dataset.get_map_classid_to_classname() + + +def evaluation(model, criterion, data_loader, global_step): + eval_loss = 0 + for _, data in enumerate(data_loader): + with torch.no_grad(): + # setup input data + inputs, labels = data + + # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose( + labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1 + ).reshape(labels.shape) + inputs = torch.transpose( + inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1 + ).reshape(inputs.shape) + + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + # forward pass model + outputs = model(inputs) + + # loss computation + loss = criterion( + outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels + ) + + eval_loss += loss.item() + + eval_avg_loss = eval_loss / len(data_loader) + # save stats + dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) + # plot the last batch in the evaluation + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.eval_figures(global_step, figures) + return eval_avg_loss + + +def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): + model.train() + best_loss = {"train_loss": None, "eval_loss": float("inf")} + avg_loader_time = 0 + end_time = time.time() + for epoch in range(c.epochs): + tot_loss = 0 + epoch_time = 0 + for _, data in enumerate(data_loader): + start_time = time.time() + + # setup input data + inputs, labels = data + # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape( + labels.shape + ) + inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape( + inputs.shape + ) + # ToDo: move it to a unit test + # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) + # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + # idx = 0 + # for j in range(0, c.num_classes_in_batch, 1): + # for i in range(j, len(labels), c.num_classes_in_batch): + # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])): + # print("Invalid") + # print(labels) + # exit() + # idx += 1 + # labels = labels_converted + # inputs = inputs_converted + + loader_time = time.time() - end_time + global_step += 1 + + # setup lr + if c.lr_decay: + scheduler.step() + optimizer.zero_grad() + + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + # forward pass model + outputs = model(inputs) + + # loss computation + loss = criterion( + outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels + ) + loss.backward() + grad_norm, _ = check_update(model, c.grad_clip) + optimizer.step() + + step_time = time.time() - start_time + epoch_time += step_time + + # acumulate the total epoch loss + tot_loss += loss.item() + + # Averaged Loader Time + num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1 + avg_loader_time = ( + 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time + if avg_loader_time != 0 + else loader_time + ) + current_lr = optimizer.param_groups[0]["lr"] + + if global_step % c.steps_plot_stats == 0: + # Plot Training Epoch Stats + train_stats = { + "loss": loss.item(), + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + "avg_loader_time": avg_loader_time, + } + dashboard_logger.train_epoch_stats(global_step, train_stats) + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.train_figures(global_step, figures) + + if global_step % c.print_step == 0: + print( + " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " + "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( + global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr + ), + flush=True, + ) + + if global_step % c.save_step == 0: + # save model + save_checkpoint( + c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict() + ) + + end_time = time.time() + + print("") + print( + ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " + "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( + epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time + ), + flush=True, + ) + # evaluation + if c.run_eval: + model.eval() + eval_loss = evaluation(model, criterion, eval_data_loader, global_step) + print("\n\n") + print("--> EVAL PERFORMANCE") + print( + " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + flush=True, + ) + # save the best checkpoint + best_loss = save_best_model( + {"train_loss": None, "eval_loss": eval_loss}, + best_loss, + c, + model, + optimizer, + None, + global_step, + epoch, 
+ OUT_PATH, + criterion=criterion.state_dict(), + ) + model.train() + + return best_loss, global_step + + +def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined + global meta_data_train + global meta_data_eval + global train_classes + + ap = AudioProcessor(**c.audio) + model = setup_encoder_model(c) + + optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model) + + # pylint: disable=redefined-outer-name + meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True) + + train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True) + if c.run_eval: + eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True) + else: + eval_data_loader = None + + num_classes = len(train_classes) + criterion = model.get_criterion(c, num_classes) + + if c.loss == "softmaxproto" and c.model != "speaker_encoder": + c.map_classid_to_classname = map_classid_to_classname + copy_model_files(c, OUT_PATH, new_fields={}) + + if args.restore_path: + criterion, args.restore_step = model.load_checkpoint( + c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion + ) + print(" > Model restored from step %d" % args.restore_step, flush=True) + else: + args.restore_step = 0 + + if c.lr_decay: + scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1) + else: + scheduler = None + + num_params = count_parameters(model) + print("\n > Model has {} parameters".format(num_params), flush=True) + + if use_cuda: + model = model.cuda() + criterion.cuda() + + global_step = args.restore_step + _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step) + + +if __name__ == "__main__": + args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training() + + try: + main(args) + except KeyboardInterrupt: + remove_experiment_folder(OUT_PATH) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(OUT_PATH) + traceback.print_exc() + sys.exit(1) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py new file mode 100644 index 0000000..bdb4f6f --- /dev/null +++ b/TTS/bin/train_tts.py @@ -0,0 +1,71 @@ +import os +from dataclasses import dataclass, field + +from trainer import Trainer, TrainerArgs + +from TTS.config import load_config, register_config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models import setup_model + + +@dataclass +class TrainTTSArgs(TrainerArgs): + config_path: str = field(default=None, metadata={"help": "Path to the config file."}) + + +def main(): + """Run `tts` model training directly by a `config.json` file.""" + # init trainer args + train_args = TrainTTSArgs() + parser = train_args.init_argparse(arg_prefix="") + + # override trainer args from comman-line args + args, config_overrides = parser.parse_known_args() + train_args.parse_args(args) + + # load config.json and register + if args.config_path or args.continue_path: + if args.config_path: + # init from a file + config = load_config(args.config_path) + if len(config_overrides) > 0: + config.parse_known_args(config_overrides, relaxed_parser=True) + elif args.continue_path: + # continue from a prev experiment + config = load_config(os.path.join(args.continue_path, "config.json")) + if len(config_overrides) > 0: + config.parse_known_args(config_overrides, relaxed_parser=True) + else: + # init from 
console args + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel + + config_base = BaseTrainingConfig() + config_base.parse_known_args(config_overrides) + config = register_config(config_base.model)() + + # load training samples + train_samples, eval_samples = load_tts_samples( + config.datasets, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, + ) + + # init the model from config + model = setup_model(config, train_samples + eval_samples) + + # init the trainer and 🚀 + trainer = Trainer( + train_args, + model.config, + config.output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + parse_command_line_args=False, + ) + trainer.fit() + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py new file mode 100644 index 0000000..32ecd7b --- /dev/null +++ b/TTS/bin/train_vocoder.py @@ -0,0 +1,77 @@ +import os +from dataclasses import dataclass, field + +from trainer import Trainer, TrainerArgs + +from TTS.config import load_config, register_config +from TTS.utils.audio import AudioProcessor +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.models import setup_model + + +@dataclass +class TrainVocoderArgs(TrainerArgs): + config_path: str = field(default=None, metadata={"help": "Path to the config file."}) + + +def main(): + """Run `tts` model training directly by a `config.json` file.""" + # init trainer args + train_args = TrainVocoderArgs() + parser = train_args.init_argparse(arg_prefix="") + + # override trainer args from comman-line args + args, config_overrides = parser.parse_known_args() + train_args.parse_args(args) + + # load config.json and register + if args.config_path or args.continue_path: + if args.config_path: + # init from a file + config = load_config(args.config_path) + if len(config_overrides) > 0: + config.parse_known_args(config_overrides, relaxed_parser=True) + elif args.continue_path: + # continue from a prev experiment + config = load_config(os.path.join(args.continue_path, "config.json")) + if len(config_overrides) > 0: + config.parse_known_args(config_overrides, relaxed_parser=True) + else: + # init from console args + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel + + config_base = BaseTrainingConfig() + config_base.parse_known_args(config_overrides) + config = register_config(config_base.model)() + + # load training samples + if "feature_path" in config and config.feature_path: + # load pre-computed features + print(f" > Loading features from: {config.feature_path}") + eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size) + else: + # load data raw wav files + eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) + + # setup audio processor + ap = AudioProcessor(**config.audio) + + # init the model from config + model = setup_model(config) + + # init the trainer and 🚀 + trainer = Trainer( + train_args, + config, + config.output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + training_assets={"audio_processor": ap}, + parse_command_line_args=False, + ) + trainer.fit() + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py new file mode 100644 index 0000000..09582ce --- /dev/null +++ b/TTS/bin/tune_wavegrad.py @@ 
-0,0 +1,103 @@ +"""Search a good noise schedule for WaveGrad for a given number of inference iterations""" +import argparse +from itertools import product as cartesian_product + +import numpy as np +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm + +from TTS.config import load_config +from TTS.utils.audio import AudioProcessor +from TTS.vocoder.datasets.preprocess import load_wav_data +from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset +from TTS.vocoder.models import setup_model + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, help="Path to model checkpoint.") + parser.add_argument("--config_path", type=str, help="Path to model config file.") + parser.add_argument("--data_path", type=str, help="Path to data directory.") + parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.") + parser.add_argument( + "--num_iter", + type=int, + help="Number of model inference iterations that you like to optimize noise schedule for.", + ) + parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.") + parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.") + parser.add_argument( + "--search_depth", + type=int, + default=3, + help="Search granularity. Increasing this increases the run-time exponentially.", + ) + + # load config + args = parser.parse_args() + config = load_config(args.config_path) + + # setup audio processor + ap = AudioProcessor(**config.audio) + + # load dataset + _, train_data = load_wav_data(args.data_path, 0) + train_data = train_data[: args.num_samples] + dataset = WaveGradDataset( + ap=ap, + items=train_data, + seq_len=-1, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + is_training=True, + return_segments=False, + use_noise_augment=False, + use_cache=False, + verbose=True, + ) + loader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + collate_fn=dataset.collate_full_clips, + drop_last=False, + num_workers=config.num_loader_workers, + pin_memory=False, + ) + + # setup the model + model = setup_model(config) + if args.use_cuda: + model.cuda() + + # setup optimization parameters + base_values = sorted(10 * np.random.uniform(size=args.search_depth)) + print(f" > base values: {base_values}") + exponents = 10 ** np.linspace(-6, -1, num=args.num_iter) + best_error = float("inf") + best_schedule = None # pylint: disable=C0103 + total_search_iter = len(base_values) ** args.num_iter + for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter): + beta = exponents * base + model.compute_noise_level(beta) + for data in loader: + mel, audio = data + y_hat = model.inference(mel.cuda() if args.use_cuda else mel) + + if args.use_cuda: + y_hat = y_hat.cpu() + y_hat = y_hat.numpy() + + mel_hat = [] + for i in range(y_hat.shape[0]): + m = ap.melspectrogram(y_hat[i, 0])[:, :-1] + mel_hat.append(torch.from_numpy(m)) + + mel_hat = torch.stack(mel_hat) + mse = torch.sum((mel - mel_hat) ** 2).mean() + if mse.item() < best_error: + best_error = mse.item() + best_schedule = {"beta": beta} + print(f" > Found a better schedule. 
- MSE: {mse.item()}") + np.save(args.output_path, best_schedule) diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py new file mode 100644 index 0000000..c5a6dd6 --- /dev/null +++ b/TTS/config/__init__.py @@ -0,0 +1,135 @@ +import json +import os +import re +from typing import Dict + +import fsspec +import yaml +from coqpit import Coqpit + +from TTS.config.shared_configs import * +from TTS.utils.generic_utils import find_module + + +def read_json_with_comments(json_path): + """for backward compat.""" + # fallback to json + with fsspec.open(json_path, "r", encoding="utf-8") as f: + input_str = f.read() + # handle comments but not urls with // + input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str) + return json.loads(input_str) + +def register_config(model_name: str) -> Coqpit: + """Find the right config for the given model name. + + Args: + model_name (str): Model name. + + Raises: + ModuleNotFoundError: No matching config for the model name. + + Returns: + Coqpit: config class. + """ + config_class = None + config_name = model_name + "_config" + + # TODO: fix this + if model_name == "xtts": + from TTS.tts.configs.xtts_config import XttsConfig + + config_class = XttsConfig + paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"] + for path in paths: + try: + config_class = find_module(path, config_name) + except ModuleNotFoundError: + pass + if config_class is None: + raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.") + return config_class + + +def _process_model_name(config_dict: Dict) -> str: + """Format the model name as expected. It is a band-aid for the old `vocoder` model names. + + Args: + config_dict (Dict): A dictionary including the config fields. + + Returns: + str: Formatted modelname. + """ + model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"] + model_name = model_name.replace("_generator", "").replace("_discriminator", "") + return model_name + + +def load_config(config_path: str) -> Coqpit: + """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name + to find the corresponding Config class. Then initialize the Config. + + Args: + config_path (str): path to the config file. + + Raises: + TypeError: given config file has an unknown type. + + Returns: + Coqpit: TTS config object. + """ + config_dict = {} + ext = os.path.splitext(config_path)[1] + if ext in (".yml", ".yaml"): + with fsspec.open(config_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + elif ext == ".json": + try: + with fsspec.open(config_path, "r", encoding="utf-8") as f: + data = json.load(f) + except json.decoder.JSONDecodeError: + # backwards compat. + data = read_json_with_comments(config_path) + else: + raise TypeError(f" [!] Unknown config file type {ext}") + config_dict.update(data) + model_name = _process_model_name(config_dict) + config_class = register_config(model_name.lower()) + config = config_class() + config.from_dict(config_dict) + return config + + +def check_config_and_model_args(config, arg_name, value): + """Check the give argument in `config.model_args` if exist or in `config` for + the given value. + + Return False if the argument does not exist in `config.model_args` or `config`. + This is to patch up the compatibility between models with and without `model_args`. + + TODO: Remove this in the future with a unified approach. 
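+
+    Example (illustrative; assumes `config` was created with `load_config()`):
+
+        >>> check_config_and_model_args(config, "use_speaker_embedding", True)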
+ """ + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] == value + if hasattr(config, arg_name): + return config[arg_name] == value + return False + + +def get_from_config_or_model_args(config, arg_name): + """Get the given argument from `config.model_args` if exist or in `config`.""" + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] + return config[arg_name] + + +def get_from_config_or_model_args_with_default(config, arg_name, def_val): + """Get the given argument from `config.model_args` if exist or in `config`.""" + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] + if hasattr(config, arg_name): + return config[arg_name] + return def_val diff --git a/TTS/config/__pycache__/__init__.cpython-311.pyc b/TTS/config/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f244eacaeb4079105e2020ebda2a52db561a42af GIT binary patch literal 6650 zcmb_g+iw$B8b4!??@1iT2?0XO$pqVD&WwRr zj$~EUD!a-(7ytub=E88h^uf(|(cM;5c%TaPZSzwV`Cb zx)&fjlv&Q!%*CQj_cip!^{@o}fd2=zHGq^{+0M91g)3t0>J5=fLQGAjMLleC^hZZV zkH2y7K4-e+R7{nW)a-b6|6rtV@Yu-oyf}3dn7eBL#SGP7>G5YHEa=nJU0BkCR1&ALDc4kM`i7*uBL}isLA-k`)7t@8BJG{(UdC5 z+VJG$#BfYa#g*CNXjDlldNdkNFZ}2}oDh>&B=KlAL}gKm&TDEadPC9Yq9DkmoYJ-M z;e;9!6WUS8SY12e)7l;&d16E^J&D9tBQZmgi;;LK5;r1QWB%%s`D?55*9@u^=k?OO zj+HmFdGZVf!IqiMxTdA$n8~YYIc4(rWz$Aw)2?N%m^|`eIufcVX(7AmVJyL(&Y4`? z^eCxxMvrPb)o>dGL7*M*HoITKJ>3bdZSdFr1SC&>a1w9(a?i@eg8kmC;n`pG>@Rut zuM^Jk%A?+LXV>kqPsctvee3j+XWb5Utbvj5u*>&I>UA&5J%9fEMb+@#K=78+=zh<* zkcDTRfT4}qz9S@X%Jw~@`7>)i|MJL+QuGNWpJ4EU)(Sk|>3Y-tXa0!mG2xE5OfQ(b zoQS5xq-;8=tY>JdNjDuZ0tNHzk3hi5abV8i(6Ufkv_Z~(m6Ig+z4 zampeeZV=1#b6h^y zX!epTM?NEx`*VA>0r#LumT!5R?aHd<&?lAAfN=o zUN0zsaGcI+BP@rdx`2d1P?OMz@Ggq1ali#p(d339&;eZNxT=q-nUr*rQk9Mfw))3;e zelXWCRD`XrS~iFk*Hv(1GHtG8zOA9MjrEMxI<*Vay_W=QW+usOK!}N{l&T9?WT=1@ zheN#Cv*C+jWUj&~-32^S+-fuU0$GKeRKT_ti`gh!t?8?jaHH4+rUQ*wGkNyH*>Jn6 z2JENutfJ{Mt$y=$=+F&+?GHfmWPK~?+?qdE?$~i>q1Z8)e`noE+#M?)th#y(S5MhJ zVl+QbyL*>f$}LfYJ^7R6_O5lpI|7U>@#{9bC$QG@vx0tay4dq(sprk5b4%x*ulf7H zdOgsqO3E$mOQYZmJOQxmr>-D;Y^|-==)3T63W)U-+a^kF6NY=D?CUo8Zmk74{$2ae zZ4bRi9Dm`CIBK$xAdWYgE5<=)t{*`L%{51^6Y9p;ybWrslmfGnZ1pmW(ACDKw`AWK zj|)xh)dED_4RIUi1QT>-|30Qt(0gWYbv+kEO^|Pb%hqKnES%8=MH55_x+&=mQITo} zswO01rlO@Y4TcK3{{l;`C0nDT{j+jPrlPJ=gimt9^hRqn)DKcJ+oF=9#i){0QrHa) z)J5fVLX62F8+`+MFqPNPc&i#5jHam?lQqqnllf@+3+S*P{@Qy$K#M!b(4L3y7@=eE ztn}RZRVlE`2<-Z5+W6&_(#crLB+$Pa=r5dm*jEhfD+TsF2^?7s95IfL8}D5x z1|~{@3FzkQGWafvk+`N@XjVk+VdnFWsn;5KC;5 z4ODfs(ZAPW9Yz|7|I*P6>i}HZUjpnlM1)0Na^kN!=DZwSrhX0lxGJ$N1+nXeVDl%q z!cDKFd3QCJ`(L>_1Q8O!M-fIadLWW}DN-?*lGzAu15B0oFZ}OsiBEW)KX{2!ID>7Q|#?2E0K+23MpB5WXtW zRtpPb3f1)21%!~ybuiG#`eIj6m_a8qBY^u7Vsmor>IQGee3h=kfC?Iksg%lET1_!$ zbizUD z8iojSxV)5*NnK5;H&TKQeJfmS@|0DGQWu<{UYQ3EZ{ZY*8Wgf+`S3zAk#&So+!QT? 
z5yD&$o*k1N?sbP}&EZ*YTB%GrJd)$6>^QpaIJxFHx!Nl^PRWi_vC<8Qk-e5EmA1Fb z+uLJIs-h(8pU$=(F=O;4q2L&pZbv%?9wB$G7Z^G}fTy?}?D){83p%r4R$D96)@dVQmV z5}LhNsBN+?9d{8}P`GgjPO#?S1j1wJ!XVgJDebqXD?DnwA7`@(fBAt1@~-k=!7 zWzqR>8q+&B+dnaXGN)4|s3W3=+8QU+>`c7A+$Pn$F4w#cfPmeoYh7|IJEXe9a@}Fb zs2~X8h{W{BOiwcNErJlV6&{BEzJYqd5K~daP`Ik#O#y!jnhhZK>Bt>zyXINc1(TxV zW>CT3dr${XOW%TF5#^HBC91iL$UrfigOgO}#5}b@ViJ)Z1%o9kn8pdeh3cXQ2r*#G zx83#L@y0JmWv|F(ufz;b$|~<(y>m6*6u&H$?UKuOK~_cO!in2AV&$rA4d9^AToW+h zQ(%gHh?z+K4St2Dm=)zdl5H0AQIz||iQQ(dL6hwzL>&?K28YE^H!Iu88S&0wuro(L zBi{-$NRDF}%Xt#%QA0Cm#}Pe62K|syo5Vw?e`H=$Lqf-dpTI3{C=uzVk*4Vs#oi*H z6!l7Si*JgmO6tFxOo*mIgB7PSVSe^0dGuH$bLJ~i5f8Z(RKQHo-%+UV=(1xQu literal 0 HcmV?d00001 diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py new file mode 100644 index 0000000..ebbaa04 --- /dev/null +++ b/TTS/encoder/configs/base_encoder_config.py @@ -0,0 +1,61 @@ +from dataclasses import asdict, dataclass, field +from typing import Dict, List + +from coqpit import MISSING + +from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig + + +@dataclass +class BaseEncoderConfig(BaseTrainingConfig): + """Defines parameters for a Generic Encoder model.""" + + model: str = None + audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) + datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + # model params + model_params: Dict = field( + default_factory=lambda: { + "model_name": "lstm", + "input_dim": 80, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": True, + } + ) + + audio_augmentation: Dict = field(default_factory=lambda: {}) + + # training params + epochs: int = 10000 + loss: str = "angleproto" + grad_clip: float = 3.0 + lr: float = 0.0001 + optimizer: str = "radam" + optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) + lr_decay: bool = False + warmup_steps: int = 4000 + + # logging params + tb_model_param_stats: bool = False + steps_plot_stats: int = 10 + save_step: int = 1000 + print_step: int = 20 + run_eval: bool = False + + # data loader + num_classes_in_batch: int = MISSING + num_utter_per_class: int = MISSING + eval_num_classes_in_batch: int = None + eval_num_utter_per_class: int = None + + num_loader_workers: int = MISSING + voice_len: float = 1.6 + + def check_values(self): + super().check_values() + c = asdict(self) + assert ( + c["model_params"]["input_dim"] == self.audio.num_mels + ), " [!] model input dimendion must be equal to melspectrogram dimension." 
diff --git a/TTS/encoder/configs/emotion_encoder_config.py b/TTS/encoder/configs/emotion_encoder_config.py new file mode 100644 index 0000000..5eda267 --- /dev/null +++ b/TTS/encoder/configs/emotion_encoder_config.py @@ -0,0 +1,12 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class EmotionEncoderConfig(BaseEncoderConfig): + """Defines parameters for Emotion Encoder model.""" + + model: str = "emotion_encoder" + map_classid_to_classname: dict = None + class_name_key: str = "emotion_name" diff --git a/TTS/encoder/configs/speaker_encoder_config.py b/TTS/encoder/configs/speaker_encoder_config.py new file mode 100644 index 0000000..6dceb00 --- /dev/null +++ b/TTS/encoder/configs/speaker_encoder_config.py @@ -0,0 +1,11 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class SpeakerEncoderConfig(BaseEncoderConfig): + """Defines parameters for Speaker Encoder model.""" + + model: str = "speaker_encoder" + class_name_key: str = "speaker_name" diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py new file mode 100644 index 0000000..582b1fe --- /dev/null +++ b/TTS/encoder/dataset.py @@ -0,0 +1,147 @@ +import random + +import torch +from torch.utils.data import Dataset + +from TTS.encoder.utils.generic_utils import AugmentWAV + + +class EncoderDataset(Dataset): + def __init__( + self, + config, + ap, + meta_data, + voice_len=1.6, + num_classes_in_batch=64, + num_utter_per_class=10, + verbose=False, + augmentation_config=None, + use_torch_spec=None, + ): + """ + Args: + ap (TTS.tts.utils.AudioProcessor): audio processor object. + meta_data (list): list of dataset instances. + seq_len (int): voice segment length in seconds. + verbose (bool): print diagnostic information. 
+ """ + super().__init__() + self.config = config + self.items = meta_data + self.sample_rate = ap.sample_rate + self.seq_len = int(voice_len * self.sample_rate) + self.num_utter_per_class = num_utter_per_class + self.ap = ap + self.verbose = verbose + self.use_torch_spec = use_torch_spec + self.classes, self.items = self.__parse_items() + + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + # Data Augmentation + self.augmentator = None + self.gaussian_augmentation_config = None + if augmentation_config: + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): + self.augmentator = AugmentWAV(ap, augmentation_config) + + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] + + if self.verbose: + print("\n > DataLoader initialization") + print(f" | > Classes per Batch: {num_classes_in_batch}") + print(f" | > Number of instances : {len(self.items)}") + print(f" | > Sequence length: {self.seq_len}") + print(f" | > Num Classes: {len(self.classes)}") + print(f" | > Classes: {self.classes}") + + def load_wav(self, filename): + audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) + return audio + + def __parse_items(self): + class_to_utters = {} + for item in self.items: + path_ = item["audio_file"] + class_name = item[self.config.class_name_key] + if class_name in class_to_utters.keys(): + class_to_utters[class_name].append(path_) + else: + class_to_utters[class_name] = [ + path_, + ] + + # skip classes with number of samples >= self.num_utter_per_class + class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class} + + classes = list(class_to_utters.keys()) + classes.sort() + + new_items = [] + for item in self.items: + path_ = item["audio_file"] + class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"] + # ignore filtered classes + if class_name not in classes: + continue + # ignore small audios + if self.load_wav(path_).shape[0] - self.seq_len <= 0: + continue + + new_items.append({"wav_file_path": path_, "class_name": class_name}) + + return classes, new_items + + def __len__(self): + return len(self.items) + + def get_num_classes(self): + return len(self.classes) + + def get_class_list(self): + return self.classes + + def set_classes(self, classes): + self.classes = classes + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + def get_map_classid_to_classname(self): + return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items()) + + def __getitem__(self, idx): + return self.items[idx] + + def collate_fn(self, batch): + # get the batch class_ids + labels = [] + feats = [] + for item in batch: + utter_path = item["wav_file_path"] + class_name = item["class_name"] + + # get classid + class_id = self.classname_to_classid[class_name] + # load wav file + wav = self.load_wav(utter_path) + offset = random.randint(0, wav.shape[0] - self.seq_len) + wav = wav[offset : offset + self.seq_len] + + if self.augmentator is not None and self.data_augmentation_p: + if random.random() < self.data_augmentation_p: + wav = self.augmentator.apply_one(wav) + + if not self.use_torch_spec: + mel = self.ap.melspectrogram(wav) + feats.append(torch.FloatTensor(mel)) + else: + feats.append(torch.FloatTensor(wav)) + + labels.append(class_id) + + feats = torch.stack(feats) + labels = torch.LongTensor(labels) 
+ + return feats, labels diff --git a/TTS/encoder/losses.py b/TTS/encoder/losses.py new file mode 100644 index 0000000..5b5aa0f --- /dev/null +++ b/TTS/encoder/losses.py @@ -0,0 +1,226 @@ +import torch +import torch.nn.functional as F +from torch import nn + + +# adapted from https://github.com/cvqluu/GE2E-Loss +class GE2ELoss(nn.Module): + def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"): + """ + Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1] + Accepts an input of size (N, M, D) + where N is the number of speakers in the batch, + M is the number of utterances per speaker, + and D is the dimensionality of the embedding vector (e.g. d-vector) + Args: + - init_w (float): defines the initial value of w in Equation (5) of [1] + - init_b (float): definies the initial value of b in Equation (5) of [1] + """ + super().__init__() + # pylint: disable=E1102 + self.w = nn.Parameter(torch.tensor(init_w)) + # pylint: disable=E1102 + self.b = nn.Parameter(torch.tensor(init_b)) + self.loss_method = loss_method + + print(" > Initialized Generalized End-to-End loss") + + assert self.loss_method in ["softmax", "contrast"] + + if self.loss_method == "softmax": + self.embed_loss = self.embed_loss_softmax + if self.loss_method == "contrast": + self.embed_loss = self.embed_loss_contrast + + # pylint: disable=R0201 + def calc_new_centroids(self, dvecs, centroids, spkr, utt): + """ + Calculates the new centroids excluding the reference utterance + """ + excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :])) + excl = torch.mean(excl, 0) + new_centroids = [] + for i, centroid in enumerate(centroids): + if i == spkr: + new_centroids.append(excl) + else: + new_centroids.append(centroid) + return torch.stack(new_centroids) + + def calc_cosine_sim(self, dvecs, centroids): + """ + Make the cosine similarity matrix with dims (N,M,N) + """ + cos_sim_matrix = [] + for spkr_idx, speaker in enumerate(dvecs): + cs_row = [] + for utt_idx, utterance in enumerate(speaker): + new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx) + # vector based cosine similarity for speed + cs_row.append( + torch.clamp( + torch.mm( + utterance.unsqueeze(1).transpose(0, 1), + new_centroids.transpose(0, 1), + ) + / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), + 1e-6, + ) + ) + cs_row = torch.cat(cs_row, dim=0) + cos_sim_matrix.append(cs_row) + return torch.stack(cos_sim_matrix) + + # pylint: disable=R0201 + def embed_loss_softmax(self, dvecs, cos_sim_matrix): + """ + Calculates the loss on each embedding $L(e_{ji})$ by taking softmax + """ + N, M, _ = dvecs.shape + L = [] + for j in range(N): + L_row = [] + for i in range(M): + L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j]) + L_row = torch.stack(L_row) + L.append(L_row) + return torch.stack(L) + + # pylint: disable=R0201 + def embed_loss_contrast(self, dvecs, cos_sim_matrix): + """ + Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid + """ + N, M, _ = dvecs.shape + L = [] + for j in range(N): + L_row = [] + for i in range(M): + centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i]) + excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :])) + L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids)) + L_row = torch.stack(L_row) + L.append(L_row) + return torch.stack(L) + + def forward(self, x, _label=None): + """ + Calculates the GE2E loss for an input of dimensions 
(num_speakers, num_utts_per_speaker, dvec_feats) + """ + + assert x.size()[1] >= 2 + + centroids = torch.mean(x, 1) + cos_sim_matrix = self.calc_cosine_sim(x, centroids) + torch.clamp(self.w, 1e-6) + cos_sim_matrix = self.w * cos_sim_matrix + self.b + L = self.embed_loss(x, cos_sim_matrix) + return L.mean() + + +# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py +class AngleProtoLoss(nn.Module): + """ + Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982 + Accepts an input of size (N, M, D) + where N is the number of speakers in the batch, + M is the number of utterances per speaker, + and D is the dimensionality of the embedding vector + Args: + - init_w (float): defines the initial value of w + - init_b (float): definies the initial value of b + """ + + def __init__(self, init_w=10.0, init_b=-5.0): + super().__init__() + # pylint: disable=E1102 + self.w = nn.Parameter(torch.tensor(init_w)) + # pylint: disable=E1102 + self.b = nn.Parameter(torch.tensor(init_b)) + self.criterion = torch.nn.CrossEntropyLoss() + + print(" > Initialized Angular Prototypical loss") + + def forward(self, x, _label=None): + """ + Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + + assert x.size()[1] >= 2 + + out_anchor = torch.mean(x[:, 1:, :], 1) + out_positive = x[:, 0, :] + num_speakers = out_anchor.size()[0] + + cos_sim_matrix = F.cosine_similarity( + out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), + out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2), + ) + torch.clamp(self.w, 1e-6) + cos_sim_matrix = cos_sim_matrix * self.w + self.b + label = torch.arange(num_speakers).to(cos_sim_matrix.device) + L = self.criterion(cos_sim_matrix, label) + return L + + +class SoftmaxLoss(nn.Module): + """ + Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982 + Args: + - embedding_dim (float): speaker embedding dim + - n_speakers (float): number of speakers + """ + + def __init__(self, embedding_dim, n_speakers): + super().__init__() + + self.criterion = torch.nn.CrossEntropyLoss() + self.fc = nn.Linear(embedding_dim, n_speakers) + + print("Initialised Softmax Loss") + + def forward(self, x, label=None): + # reshape for compatibility + x = x.reshape(-1, x.size()[-1]) + label = label.reshape(-1) + + x = self.fc(x) + L = self.criterion(x, label) + + return L + + def inference(self, embedding): + x = self.fc(embedding) + activations = torch.nn.functional.softmax(x, dim=1).squeeze(0) + class_id = torch.argmax(activations) + return class_id + + +class SoftmaxAngleProtoLoss(nn.Module): + """ + Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 + Args: + - embedding_dim (float): speaker embedding dim + - n_speakers (float): number of speakers + - init_w (float): defines the initial value of w + - init_b (float): definies the initial value of b + """ + + def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): + super().__init__() + + self.softmax = SoftmaxLoss(embedding_dim, n_speakers) + self.angleproto = AngleProtoLoss(init_w, init_b) + + print("Initialised SoftmaxAnglePrototypical Loss") + + def forward(self, x, label=None): + """ + Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + + Lp = self.angleproto(x) + + Ls = self.softmax(x, label) + + return Ls + Lp diff --git 
a/TTS/encoder/models/__pycache__/base_encoder.cpython-311.pyc b/TTS/encoder/models/__pycache__/base_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51ea66b4ce060e7a2bd528863f7dc2cfcfe8031a GIT binary patch literal 8120 [compiled __pycache__ bytecode; binary payload not shown]
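To make the loss interfaces in TTS/encoder/losses.py concrete, the following self-contained sketch (not part of the patch) feeds random embeddings shaped (num_speakers, num_utterances_per_speaker, embedding_dim) through the three main losses; every size below is an arbitrary assumption chosen only to satisfy the `x.size()[1] >= 2` check.

# Illustrative sketch: exercising the speaker-verification losses with random embeddings.
import torch

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss

N, M, D = 4, 5, 256                      # speakers, utterances per speaker, embedding dim
dvecs = torch.randn(N, M, D)

print("GE2E:", GE2ELoss(loss_method="softmax")(dvecs).item())
print("AngleProto:", AngleProtoLoss()(dvecs).item())

# SoftmaxAngleProtoLoss also needs a per-utterance class (speaker) label.
labels = torch.arange(N).repeat_interleave(M).reshape(N, M)
print("Softmax+AngleProto:", SoftmaxAngleProtoLoss(embedding_dim=D, n_speakers=N)(dvecs, labels).item())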
zZzdkvzDenK=k$(xXL>U*OpMb&0mhPy$jq{@7A)wd4EJSo_5$%nTU1|5x*O(eiP3eAg$7U+e}ZygWm-%dUp%{3fQ8V;xp2Ob@M zO#jm{rJ;MN{={Pai6{NJ`V&h1MYaB-;_p-aebAot-<1(FyRzWf?o@p{VI5$C$7hbu zb>7)~d+!(f=l5s2=MQCV3cEvPcjzC>xKD-?s=_t+fqU5bs43@b zQ+)5KzV~20mA*7XY;A~Uqj3_VkB7XgPr}{dFrSRT6;3n@NH{=5=SBPlf*lB25bOeA zzFKw|3lX4mB>f`*)3Nx9&E{mlmj_TmRoCYIHTlY|=~OzEui2R+fBD+FeATA>rh0ID zR;u?q>sP3afK0ob5M(fbufn-=g|dR}c06M@fVa)*U!h(L^hg0t0vtZv*7~m91~*09 z3&Hw!a^5Yq55NuQaG^{@qg8XACD-%xJ9fLVF%tTg(utyW`jL50kK(*9{syubP>}u= zz>1xw={&{WB)>dWle7M Model fully restored. ") + except (KeyError, RuntimeError) as error: + # If eval raise the error + if eval: + raise error + + print(" > Partial model initialization.") + model_dict = self.state_dict() + model_dict = set_init_dict(model_dict, state["model"], c) + self.load_state_dict(model_dict) + del model_dict + + # load the criterion for restore_path + if criterion is not None and "criterion" in state: + try: + criterion.load_state_dict(state["criterion"]) + except (KeyError, RuntimeError) as error: + print(" > Criterion load ignored because of:", error) + + # instance and load the criterion for the encoder classifier in inference time + if ( + eval + and criterion is None + and "criterion" in state + and getattr(config, "map_classid_to_classname", None) is not None + ): + criterion = self.get_criterion(config, len(config.map_classid_to_classname)) + criterion.load_state_dict(state["criterion"]) + + if use_cuda: + self.cuda() + if criterion is not None: + criterion = criterion.cuda() + + if eval: + self.eval() + assert not self.training + + if not eval: + return criterion, state["step"] + return criterion diff --git a/TTS/encoder/models/lstm.py b/TTS/encoder/models/lstm.py new file mode 100644 index 0000000..51852b5 --- /dev/null +++ b/TTS/encoder/models/lstm.py @@ -0,0 +1,99 @@ +import torch +from torch import nn + +from TTS.encoder.models.base_encoder import BaseEncoder + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + + +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + + +class LSTMSpeakerEncoder(BaseEncoder): + def __init__( + self, + input_dim, + proj_dim=256, + lstm_dim=768, + num_lstm_layers=3, + use_lstm_with_projection=True, + use_torch_spec=False, + audio_config=None, + ): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + self.proj_dim = proj_dim + + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, 
num_lstm_layers) + + self.instancenorm = nn.InstanceNorm1d(input_dim) + + if self.use_torch_spec: + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) + else: + self.torch_spec = None + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x, l2_norm=True): + """Forward pass of the model. + + Args: + x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` + to compute the spectrogram on-the-fly. + l2_norm (bool): Whether to L2-normalize the outputs. + + Shapes: + - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` + """ + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.use_torch_spec: + x.squeeze_(1) + x = self.torch_spec(x) + x = self.instancenorm(x).transpose(1, 2) + d = self.layers(x) + if self.use_lstm_with_projection: + d = d[:, -1] + if l2_norm: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py new file mode 100644 index 0000000..5eafcd6 --- /dev/null +++ b/TTS/encoder/models/resnet.py @@ -0,0 +1,198 @@ +import torch +from torch import nn + +# from TTS.utils.audio.torch_transforms import TorchSTFT +from TTS.encoder.models.base_encoder import BaseEncoder + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + + +class ResNetSpeakerEncoder(BaseEncoder): + """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 + Adapted from: https://github.com/clovaai/voxceleb_trainer + """ + + # pylint: disable=W0102 + def __init__( + self, + input_dim=64, + proj_dim=512, + layers=[3, 4, 6, 3], + num_filters=[32, 64, 128, 256], + encoder_type="ASP", + log_input=False, + use_torch_spec=False, + audio_config=None, + ): + super(ResNetSpeakerEncoder, self).__init__() + + self.encoder_type = encoder_type + self.input_dim = input_dim + self.log_input = log_input + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + self.proj_dim = proj_dim + + self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) + self.relu = nn.ReLU(inplace=True) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + + self.inplanes = num_filters[0] + self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) + self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) + + self.instancenorm = nn.InstanceNorm1d(input_dim) + + if self.use_torch_spec: + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) + else: + self.torch_spec = None + + outmap_size = int(self.input_dim / 8) + + self.attention = nn.Sequential( + nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), + nn.ReLU(), + nn.BatchNorm1d(128), + nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), + nn.Softmax(dim=2), + ) + + if self.encoder_type == "SAP": + out_dim = num_filters[3] * outmap_size + elif self.encoder_type == "ASP": + out_dim = num_filters[3] * outmap_size * 2 + else: + raise ValueError("Undefined encoder") + + self.fc = nn.Linear(out_dim, proj_dim) + + self._init_layers() + + def _init_layers(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def create_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + # pylint: disable=R0201 + def new_parameter(self, *size): + out = nn.Parameter(torch.FloatTensor(*size)) + nn.init.xavier_normal_(out) + return out + + def forward(self, x, l2_norm=False): + """Forward pass of the model. + + Args: + x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` + to compute the spectrogram on-the-fly. + l2_norm (bool): Whether to L2-normalize the outputs. 
+ + Shapes: + - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` + """ + x.squeeze_(1) + # if you torch spec compute it otherwise use the mel spec computed by the AP + if self.use_torch_spec: + x = self.torch_spec(x) + + if self.log_input: + x = (x + 1e-6).log() + x = self.instancenorm(x).unsqueeze(1) + + x = self.conv1(x) + x = self.relu(x) + x = self.bn1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = x.reshape(x.size()[0], -1, x.size()[-1]) + + w = self.attention(x) + + if self.encoder_type == "SAP": + x = torch.sum(x * w, dim=2) + elif self.encoder_type == "ASP": + mu = torch.sum(x * w, dim=2) + sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5)) + x = torch.cat((mu, sg), 1) + + x = x.view(x.size()[0], -1) + x = self.fc(x) + + if l2_norm: + x = torch.nn.functional.normalize(x, p=2, dim=1) + return x diff --git a/TTS/encoder/requirements.txt b/TTS/encoder/requirements.txt new file mode 100644 index 0000000..a486cc4 --- /dev/null +++ b/TTS/encoder/requirements.txt @@ -0,0 +1,2 @@ +umap-learn +numpy>=1.17.0 diff --git a/TTS/encoder/utils/__init__.py b/TTS/encoder/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/TTS/encoder/utils/__pycache__/__init__.cpython-311.pyc b/TTS/encoder/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae8731445fea368b7141b7d803d8099ae6abe356 GIT binary patch literal 179 zcmZ3^%ge<81nFs6X(0MBh=2h`DC095kTIPhg&~+hlhJP_LlF~@{~09t%S}JDIJKx) zzcR5nL*FH}IJ+djK;Jn(H?1<%Q$M-1xFkO}J}*BdwOBtSBv?N+FB!-#(l0H^%qiB7 vkI&4@EQycTE2#X%VUwGmQks)$SHuc57G!ZTKalvq%*e?2fdNJoF$2W_MqexQ literal 0 HcmV?d00001 diff --git a/TTS/encoder/utils/__pycache__/generic_utils.cpython-311.pyc b/TTS/encoder/utils/__pycache__/generic_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68db90c1306626cd6db6d50f8ee2a2769c0c9ebe GIT binary patch literal 7424 zcmb_hUu+Xen(wyT{~gS^6U~grxI2l1|IyuH zCYGZ`D^9XbdB_8I!|pPYSDIzR!s;U3KJaiS@o+2U;ZEwd>MpI8kfL4jFuE5bcsT9T ze${Qay9vzhNZVUH|`Aru~KjN8Np z$$nlm(WbD*138-3BC)LHX>+5^nz?9(FUM(0uXT}t64<6b`fvXhTh0@ojjd6G^Gz-eMM!QC?NIe8UJ=T0h66N$c~ zR^}`&mvt64>WgODVzh;(o>+~XHA@P6NVC3Zfj)n=W-VIFO$|wn9LTg8C6qml+D6V` zr?<{G_L8C20SMo&t;8Mw2gsiBc~zr9lo|2tpH!I@?CWc z34(U;2OFiL#n}3n1ntz@`Ag!PJ!7ZLdu^O(*WWtt*?)pr?`qf&Qs(JQ7B?9IWfmr^cv(<10ie z@61Kno}lRzpSsP+E-tr{rMY-M!7t4f?0tQMeX%bSw;@_OSSGo`a<~z};-y+R8YxE3 zMy_+I>|$iRCLl7COL0u37W)!>DwiFNe3{~xB89d{VueoSxJW9?@ri7b;UWucE)$W> zY>JibKoqToUY*!$)7QR7V)Gt}(N@`%$jdD&922kiDW1h9$<$3|m6L-;ZakUGE~FOa zz#D9SHP3LeGX;H9ZIvlHmE>h>BA;inv`j6gb2nsDj*}_uDVbW%rLwY9(G*XoI9|4~ zOg^1RGO~rsr&GLa=9s)}$+M{}j$nUSM&kko|^cHEB&+uM{KA-~rjA@l-W-7+po9>`<+zQ!IyLXQcN0 z`SRd(j$ygM)daURc%I>I^11xrL@u+idVP8@xx(?e47iwPxWT!(*+E9#UI$nBRGJ%H zWU>sKO2(B^EWaw-?E6tW-Kbp?g+ zz)tvBC45W_A4lQig6p{GIxa=Wp3bA_ROvi&AC@{JsPp938tS}Ix*~ZyUlMIiEmB`> zgF?O|y8}uZl{yZfj**>?(Mrdt*fEAW#;Sy=`K&@VCZ4x-l@Forv8S`Bd;AY4#I`Ba zHdQ6a;8`iui9*MALZ>RBQ&0V3=nM*-5khDF|3z2Nx8dInKY8z8$9_BZwCPzv?7xQk zuL+;u5W13LR}yt4H$1!Dhc>8t-rY#7O4yn|BvraGB}IC7B14tP(AK_ZCNVOBA`?52 z=}Kh!zq|kIuo(F+%VKBRheCN?lV1oBON#hiTj1SS!hZA$LzDff9@C0{0N=-5fs%|bkoR93vPPX?%uHvRO|zyJ%;Qt zq5i&tSw$o@qyPnshVbvz&wv01BbFgmAGQffT3d+L8Y;=!6lA28p`C`R*iI;EHDwLE z)TQE-Cf|(HIR4__Bh!p*gHRVAzdjkYus&$a;;l<=uC4b*Tw(FH%eDfEl57to9JV#W z4q)~nAca2Ny80_2QC&sn^nx&?RlZqE@V3k}v`rI3yh z$?b!x9e1?ij&4Rp_X*@aA=KZh8CodK*cb>lAYU9+6tA}cg;N2^Ykme~rsWM>&8HaM z7C?g0hqSo>LXErvoIcjfybd)suq2F9{cHlryMYM|tuaDZ9%!ve(YK~`DE6IgEDh;K 
z^k*!sQAJY&tJ2oSS8EoU08~idF*^BnKTTryzR1^TNJZJDw15LimLpq%L!#%X-_7mC)Rs{nE!|G&U(1^o3^gb!yulX3KCiK3}! z#d8>tm}BNUJh0#4Ilp*xqVxa$c(VNi*^^0SXl7dz#Qq^;;|u_$}ownY-JN!I+uZ?aUzpXGi(Q_0;mE^z$Gk(qJUSH z5_v{N74~#4LB|0h04`KMh_e7m0J>yT3pq9e0OS_S>mY?_Hw%@y4ChyIZ98xe`a!SDiRl?`!RY>WA6yEosZ6_S9grl21Tjzg2iB3!k;iwp% zLgA_H_U-6)I||R0F70~5$a|nnB5!y3lZrPccw>?;^dhwXfk$V`)=!adQdd@O1=GUV zw6Lc%jW_l7Z_YiK`t`MMKi-^w+WhD>^UcPPN2>Sv2CI>^*q%6z*aVG z+P6-Lz3)FwiM^BC!{U)EVq``PT}7d*!nIGu(5IzouuSlFN&60OI5(WTzV`bm(RT#r zt5(9@D!BGZ{jD!8iG2- zYe#s{-?)s>bG2YyglnX|fyQVb4n}aXqC40%LOcW(xWZ{1^7R|k6N~Y>48p%jAO0cm zSZ9^o$1{NH;p9gtX%mar3wstda7 z_(1UN7d;;$&xa+mZ7H*dY)ILxTsH#h{QK#^fiM;6NHR z>qc!77#udD9&2R>oQ%_}@m!WE1aw2Baq&$sHx1+t@f=6d@(FbCv=|&k!BL2OuHgM} z#nmmix|NV{u;M>hz99PhkiSoG_w9Q9JKpXJ#JADS7129_yd#2rMDg+r9E0IZnN4Jv zcwBbI@tj>rUlWk>8Q_QIaww464d~#7S@$|9buiw-O8tU!X?DEc(D#)}|A**Pe%vd#sgcQ*A1J!Rx{tOut48-7T?!Z$!S0_Bym{Xs^Sw zdj8udFm0pl#%T%PP<16w91kJjK8BzAtdZHfdYM1i3(lGs5^vmA*A>8jA{uDipy?f6 zct@{XZ|Rk$;~n38zbD)eVcZ^TW4A^Y@qa#=W@IN0)oEop$W)r+GqNL<&9CrrI+c;_ zc{aDK=5Upg^I~MFl+uY+xaE;O_*#n3vB{-4muHd}qb7C(RtP{EW_hg00G2|YWI=WCtY^P(m(lIO?pLlj%?6{0N zE=%D94?3$&wr1x`xXb}O2c%>vSpd;FeaO*y|5N1X+i?t497CdG7&(TwmMV_3g5&IS z-yx~B6SYP+`%&vjDcFI6y_@YQctYjwv>vUr9^Jh8bVO_&N3G-EtKMbB>VW{9#OkXa z!rgr5BPE#ByvTk5o{4(cHb~eICd%e*fg#VYD*H5m{bUOAIIhTTaD5l6xi_ZzKZa+9 zI!@uY)T)z!#RDf>b2pZmB(H=|WmwqHu||9epaj=H+s8mAgyyp>9xxoGxPJoz7X~CL z5w1JRpF}wBD1Q>sbVvEyC0c~~D-j{V`0f%X1>-9bqk{3RS{F$YBJbPjM(azZ=C!_l P&fG#aSBZBJrBi Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}" + ) + + self.use_rir = False + + if "rir" in augmentation_config.keys(): + self.rir_config = augmentation_config["rir"] + if self.rir_config["rir_path"]: + self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True) + self.use_rir = True + + print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") + + self.create_augmentation_global_list() + + def create_augmentation_global_list(self): + if self.use_additive_noise: + self.global_noise_list = self.additive_noise_types + else: + self.global_noise_list = [] + if self.use_rir: + self.global_noise_list.append("RIR_AUG") + + def additive_noise(self, noise_type, audio): + clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4) + + noise_list = random.sample( + self.noise_list[noise_type], + random.randint( + self.additive_noise_config[noise_type]["min_num_noises"], + self.additive_noise_config[noise_type]["max_num_noises"], + ), + ) + + audio_len = audio.shape[0] + noises_wav = None + for noise in noise_list: + noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len] + + if noiseaudio.shape[0] < audio_len: + continue + + noise_snr = random.uniform( + self.additive_noise_config[noise_type]["min_snr_in_db"], + self.additive_noise_config[noise_type]["max_num_noises"], + ) + noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4) + noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio + + if noises_wav is None: + noises_wav = noise_wav + else: + noises_wav += noise_wav + + # if all possible files is less than audio, choose other files + if noises_wav is None: + return self.additive_noise(noise_type, audio) + + return audio + noises_wav + + def reverberate(self, audio): + audio_len = audio.shape[0] + + rir_file = random.choice(self.rir_files) + rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) + rir = rir / np.sqrt(np.sum(rir**2)) + return signal.convolve(audio, rir, 
mode=self.rir_config["conv_mode"])[:audio_len] + + def apply_one(self, audio): + noise_type = random.choice(self.global_noise_list) + if noise_type == "RIR_AUG": + return self.reverberate(audio) + + return self.additive_noise(noise_type, audio) + + +def setup_encoder_model(config: "Coqpit"): + if config.model_params["model_name"].lower() == "lstm": + model = LSTMSpeakerEncoder( + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], + use_torch_spec=config.model_params.get("use_torch_spec", False), + audio_config=config.audio, + ) + elif config.model_params["model_name"].lower() == "resnet": + model = ResNetSpeakerEncoder( + input_dim=config.model_params["input_dim"], + proj_dim=config.model_params["proj_dim"], + log_input=config.model_params.get("log_input", False), + use_torch_spec=config.model_params.get("use_torch_spec", False), + audio_config=config.audio, + ) + return model diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py new file mode 100644 index 0000000..b93baf9 --- /dev/null +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -0,0 +1,219 @@ +# coding=utf-8 +# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# Only support eager mode and TF>=2.0.0 +# pylint: disable=no-member, invalid-name, relative-beyond-top-level +# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes +""" voxceleb 1 & 2 """ + +import hashlib +import os +import subprocess +import sys +import zipfile + +import pandas +import soundfile as sf +from absl import logging + +SUBSETS = { + "vox1_dev_wav": [ + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad", + ], + "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"], + "vox2_dev_aac": [ + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah", + ], + "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"], +} + +MD5SUM = { + "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b", + "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402", + "vox1_test_wav": "185fdc63c3c739954633d50379a3d102", + "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312", +} + +USER = {"user": "", "password": ""} + +speaker_id_dict = {} + + +def download_and_extract(directory, subset, urls): + """Download and extract the given split of dataset. + + Args: + directory: the directory where to put the downloaded data. + subset: subset name of the corpus. + urls: the list of urls to download the data file. 
+ """ + os.makedirs(directory, exist_ok=True) + + try: + for url in urls: + zip_filepath = os.path.join(directory, url.split("/")[-1]) + if os.path.exists(zip_filepath): + continue + logging.info("Downloading %s to %s" % (url, zip_filepath)) + subprocess.call( + "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), + shell=True, + ) + + statinfo = os.stat(zip_filepath) + logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + + # concatenate all parts into zip files + if ".zip" not in zip_filepath: + zip_filepath = "_".join(zip_filepath.split("_")[:-1]) + subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True) + zip_filepath += ".zip" + extract_path = zip_filepath.strip(".zip") + + # check zip file md5sum + with open(zip_filepath, "rb") as f_zip: + md5 = hashlib.md5(f_zip.read()).hexdigest() + if md5 != MD5SUM[subset]: + raise ValueError("md5sum of %s mismatch" % zip_filepath) + + with zipfile.ZipFile(zip_filepath, "r") as zfile: + zfile.extractall(directory) + extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename) + subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True) + finally: + # os.remove(zip_filepath) + pass + + +def exec_cmd(cmd): + """Run a command in a subprocess. + Args: + cmd: command line to be executed. + Return: + int, the return code. + """ + try: + retcode = subprocess.call(cmd, shell=True) + if retcode < 0: + logging.info(f"Child was terminated by signal {retcode}") + except OSError as e: + logging.info(f"Execution failed: {e}") + retcode = -999 + return retcode + + +def decode_aac_with_ffmpeg(aac_file, wav_file): + """Decode a given AAC file into WAV using ffmpeg. + Args: + aac_file: file path to input AAC file. + wav_file: file path to output WAV file. + Return: + bool, True if success. + """ + cmd = f"ffmpeg -i {aac_file} {wav_file}" + logging.info(f"Decoding aac file using command line: {cmd}") + ret = exec_cmd(cmd) + if ret != 0: + logging.error(f"Failed to decode aac file with retcode {ret}") + logging.error("Please check your ffmpeg installation.") + return False + return True + + +def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): + """Optionally convert AAC to WAV and make speaker labels. + Args: + input_dir: the directory which holds the input dataset. + subset: the name of the specified subset. e.g. vox1_dev_wav + output_dir: the directory to place the newly generated csv files. + output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv + """ + + logging.info("Preprocessing audio and label for subset %s" % subset) + source_dir = os.path.join(input_dir, subset) + + files = [] + # Convert all AAC file into WAV format. At the same time, generate the csv + for root, _, filenames in os.walk(source_dir): + for filename in filenames: + name, ext = os.path.splitext(filename) + if ext.lower() == ".wav": + _, ext2 = os.path.splitext(name) + if ext2: + continue + wav_file = os.path.join(root, filename) + elif ext.lower() == ".m4a": + # Convert AAC to WAV. 
+ aac_file = os.path.join(root, filename) + wav_file = aac_file + ".wav" + if not os.path.exists(wav_file): + if not decode_aac_with_ffmpeg(aac_file, wav_file): + raise RuntimeError("Audio decoding failed.") + else: + continue + speaker_name = root.split(os.path.sep)[-2] + if speaker_name not in speaker_id_dict: + num = len(speaker_id_dict) + speaker_id_dict[speaker_name] = num + # wav_filesize = os.path.getsize(wav_file) + wav_length = len(sf.read(wav_file)[0]) + files.append((os.path.abspath(wav_file), wav_length, speaker_id_dict[speaker_name], speaker_name)) + + # Write to CSV file which contains four columns: + # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name". + csv_file_path = os.path.join(output_dir, output_file) + df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) + df.to_csv(csv_file_path, index=False, sep="\t") + logging.info("Successfully generated csv file {}".format(csv_file_path)) + + +def processor(directory, subset, force_process): + """download and process""" + urls = SUBSETS + if subset not in urls: + raise ValueError(subset, "is not in voxceleb") + + subset_csv = os.path.join(directory, subset + ".csv") + if not force_process and os.path.exists(subset_csv): + return subset_csv + + logging.info("Downloading and process the voxceleb in %s", directory) + logging.info("Preparing subset %s", subset) + download_and_extract(directory, subset, urls[subset]) + convert_audio_and_make_label(directory, subset, directory, subset + ".csv") + logging.info("Finished downloading and processing") + return subset_csv + + +if __name__ == "__main__": + logging.set_verbosity(logging.INFO) + if len(sys.argv) != 4: + print("Usage: python prepare_data.py save_directory user password") + sys.exit() + + DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3] + for SUBSET in SUBSETS: + processor(DIR, SUBSET, False) diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py new file mode 100644 index 0000000..ff8f271 --- /dev/null +++ b/TTS/encoder/utils/training.py @@ -0,0 +1,99 @@ +import os +from dataclasses import dataclass, field + +from coqpit import Coqpit +from trainer import TrainerArgs, get_last_checkpoint +from trainer.io import copy_model_files +from trainer.logging import logger_factory +from trainer.logging.console_logger import ConsoleLogger + +from TTS.config import load_config, register_config +from TTS.tts.utils.text.characters import parse_symbols +from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch + + +@dataclass +class TrainArgs(TrainerArgs): + config_path: str = field(default=None, metadata={"help": "Path to the config file."}) + + +def getarguments(): + train_config = TrainArgs() + parser = train_config.init_argparse(arg_prefix="") + return parser + + +def process_args(args, config=None): + """Process parsed comand line arguments and initialize the config if not provided. + Args: + args (argparse.Namespace or dict like): Parsed input arguments. + config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. + Returns: + c (TTS.utils.io.AttrDict): Config paramaters. + out_path (str): Path to save models and logging. + audio_path (str): Path to save generated test audios. + c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does + logging to the console. + dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging + TODO: + - Interactive config definition. 
+ """ + if isinstance(args, tuple): + args, coqpit_overrides = args + if args.continue_path: + # continue a previous training from its output folder + experiment_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + args.restore_path, best_model = get_last_checkpoint(args.continue_path) + if not args.best_path: + args.best_path = best_model + # init config if not already defined + if config is None: + if args.config_path: + # init from a file + config = load_config(args.config_path) + else: + # init from console args + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel + + config_base = BaseTrainingConfig() + config_base.parse_known_args(coqpit_overrides) + config = register_config(config_base.model)() + # override values from command-line args + config.parse_known_args(coqpit_overrides, relaxed_parser=True) + experiment_path = args.continue_path + if not experiment_path: + experiment_path = get_experiment_folder_path(config.output_path, config.run_name) + audio_path = os.path.join(experiment_path, "test_audios") + config.output_log_path = experiment_path + # setup rank 0 process in distributed training + dashboard_logger = None + if args.rank == 0: + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + # if model characters are not set in the config file + # save the default set to the config file for future + # compatibility. + if config.has("characters") and config.characters is None: + used_characters = parse_symbols() + new_fields["characters"] = used_characters + copy_model_files(config, experiment_path, new_fields) + dashboard_logger = logger_factory(config, experiment_path) + c_logger = ConsoleLogger() + return config, experiment_path, audio_path, c_logger, dashboard_logger + + +def init_arguments(): + train_config = TrainArgs() + parser = train_config.init_argparse(arg_prefix="") + return parser + + +def init_training(config: Coqpit = None): + """Initialization of a training run.""" + parser = init_arguments() + args = parser.parse_known_args() + config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config) + return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger diff --git a/TTS/encoder/utils/visual.py b/TTS/encoder/utils/visual.py new file mode 100644 index 0000000..6575b86 --- /dev/null +++ b/TTS/encoder/utils/visual.py @@ -0,0 +1,50 @@ +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import umap + +matplotlib.use("Agg") + + +colormap = ( + np.array( + [ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], + ], + dtype=float, + ) + / 255 +) + + +def plot_embeddings(embeddings, num_classes_in_batch): + num_utter_per_class = embeddings.shape[0] // num_classes_in_batch + + # if necessary get just the first 10 classes + if num_classes_in_batch > 10: + num_classes_in_batch = 10 + embeddings = embeddings[: num_classes_in_batch * num_utter_per_class] + + model = umap.UMAP() + projection = model.fit_transform(embeddings) + ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class) + colors = [colormap[i] for i in ground_truth] + fig, ax = plt.subplots(figsize=(16, 10)) + _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors) + plt.gca().set_aspect("equal", 
"datalim") + plt.title("UMAP projection") + plt.tight_layout() + plt.savefig("umap") + return fig diff --git a/TTS/model.py b/TTS/model.py new file mode 100644 index 0000000..ae6be7b --- /dev/null +++ b/TTS/model.py @@ -0,0 +1,59 @@ +from abc import abstractmethod +from typing import Dict + +import torch +from coqpit import Coqpit +from trainer import TrainerModel + +# pylint: skip-file + + +class BaseTrainerModel(TrainerModel): + """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. + + Every new 🐸TTS model must inherit it. + """ + + @staticmethod + @abstractmethod + def init_from_config(config: Coqpit): + """Init the model and all its attributes from the given config. + + Override this depending on your model. + """ + ... + + @abstractmethod + def inference(self, input: torch.Tensor, aux_input={}) -> Dict: + """Forward pass for inference. + + It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs``` + is considered to be the main output and you can add any other auxiliary outputs as you want. + + We don't use `*kwargs` since it is problematic with the TorchScript API. + + Args: + input (torch.Tensor): [description] + aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc. + + Returns: + Dict: [description] + """ + outputs_dict = {"model_outputs": None} + ... + return outputs_dict + + @abstractmethod + def load_checkpoint( + self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False + ) -> None: + """Load a model checkpoint gile and get ready for training or inference. + + Args: + config (Coqpit): Model configuration. + checkpoint_path (str): Path to the model checkpoint file. + eval (bool, optional): If true, init model for inference else for training. Defaults to False. + strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True. + cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False. + """ + ... 
diff --git a/TTS/utils/__init__.py b/TTS/utils/__init__.py new file mode 100644 index 0000000..e69de29
diff --git a/TTS/utils/__pycache__/__init__.cpython-311.pyc b/TTS/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 GIT binary patch literal 171 [compiled __pycache__ bytecode; binary payload not shown]
diff --git a/TTS/utils/__pycache__/generic_utils.cpython-311.pyc b/TTS/utils/__pycache__/generic_utils.cpython-311.pyc new file mode 100644 GIT binary patch literal 15661 [compiled __pycache__ bytecode; binary payload not shown]
diff --git a/TTS/utils/__pycache__/io.cpython-311.pyc b/TTS/utils/__pycache__/io.cpython-311.pyc new file mode 100644 GIT binary patch literal 4685 [compiled __pycache__ bytecode; binary payload not shown]
diff --git a/TTS/utils/__pycache__/manage.cpython-311.pyc b/TTS/utils/__pycache__/manage.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b154639857141ef4d9298daef9839c3a410bc12 GIT binary patch literal 36630
zisoNd+#xurD3c56re`E{gQFZonGg`7PllYrdr-eH2S9U&!|$deaWddx0oA+ka|9Kh z5U{7hpCkAS1m8#CM}UXGg+D9<)GFeK0CJVzQ3oe))c34rfuxisoi-c zGb-b0pKYJUQ;VnJ#Ns88p@*Oup&!Ak2#nOzJm`YO(~Gd4oK6XU1zG+E?$U9FRoAA4 z!*`A@9M%6S0v=b_e$};KX7-cLos3!`Z#mfEg;)n(&|lB$Sgreg&RTu6X-)Itp_JNV zd2_a`HIFT<3$@Ngn#x*)_rKNQQqbqmI_M__)*3VpjL{{Qfo#T!J7+iauCn2iwR4sS zwFhGySx3II^|RI@=xdv?9q66)8QVaPbvYSfz<|RuwpHewS?B+A>owaxx;;chJ-7+` z+B+j6EEuQxkl5YcIW6*0F|>C-$9A#popFAKCn6vu_QNNzqX>(RkeCUy&vvy>!!ca8 z{d9l(g??aZTC<^NWPDo6*uvkLgAI`hjlPKsGhDvHy2S=D6X1o^9KhV?;T_L;=!;-5 zFX6y0#A177^F%lTq}F%laYB+Apto<0jZ5>?K77o=<32DTxOhHB%h0EH60$H7HL`KEgT5NzEiq zin!Yci^_7)7ng`pIu+I%6aF5u{2kol{{#SACJsLyetcDN>{J~)bH`Twn^k{{+ zghR2ns`ggd-ik4vyPie(^TZa_zD2fgA&r@#F|43f@$7&jFfUdT3d8l(28Ho|ZB0ZH zg<5qvM-Er0KnHzn(VP)>Ah-)av*D%{9QDz+$gHqG4_qQJn2w8b@;@-mI!gZ4NB&wT z)MNBs1P2fdA$T3Z6$E$$3pUl@2)9)~J|N6s^elqg2yi!xIM~SQUsqKqVm0Q>@X3p0 zT zDOCZD9y<7xssPqP!(S6C!0(`sV&)QPG|^|!?YIV=7TS;bOTf?2JC`m2%hEfSMe=w! zLmsuj>xPU`(BkyY{5W~kg6$6aHF~K5TYrttAJ?B7z{X8<%TuZX_V{TiQvn@bI=Cb~ zrQoqVN*)h6@~8!^Z8TU%>%l)^+&`}+oQE;*0|TWD?9{$PKi&G6mq*{m9svJ81CTlK Fe*vm!te*e? literal 0 HcmV?d00001 diff --git a/TTS/utils/__pycache__/io.cpython-311.pyc b/TTS/utils/__pycache__/io.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..decdb5608a45a46c79f59a6c95865684d5a80310 GIT binary patch literal 4685 zcmcH+TWl29_1>MCec9_>KLCS)48~9vIqRemYS~f}W8#DWM#Z)imPV8D%w2oz*`3Xu zSv$7Yl2swqM9$-*6_(mcRzx8YRLv(JtwbIl{ai;IWi^r|q<-~pk%@%z)pPIcGY0xn z@64RJk8{pF_nvdlIrnxX5=Ky_)3fRAAVPmB@zG`E;A1c3w%_b;?bp)G?Q^G1rW8HTxm=agfyErAm zw1V4tnPkJ5zrRaDuIKB<|DDHQIio%3e6gR9W-ZG%xt|;&1RkJ!0 zP!!$L6ot#;Mykuxh-oUyr3!=K7aapf;xR z4zaanW0XL;D16(m17Jq%dxja$)+r0vKn34T`~k+O0FP2PRULtG;<30unOkJ4GO~sP z>VX;)S5e|3jj3U!- zPzv_04-|v(jiUg(x7@pbRa}d#!o!si_ftkn_b|}Q0P|jy%|iDv&#qX02RM5W%Jab@7CtNY7Ix}v2M$IXvrKt{hKv&XKZHBmhN}N2+ zw)h)nqRcwCOP~huHc+jVhGL%VQ3~i5(A=r3FL&@M)Xn~_#vIh!O1&%=d^BAUZQWrq zRm&_hC0=w~K)*wO@aaCM78;@7VP;k>y3oq8wPj#kyeqW^v=eAqa_Zp=q1pP(1qr;q z9}E%~1ZI$$PYoS(AEJ9(02>oZL|A)j+( z!}jQOIiJ;Gm2dAz2#&>vAk-sK-Spo>aJbpr>|+_E=|o6aJ3NfD3BOgzJK0KOKogn-rAFHgjW=Xs7^yz ztxM|q=v3V2hU&0XPz4@v!T&Y;Q-~{CIg)h+*l-%) zXk2JlWL5#5jgfL*^a!iu9XFr?RJen*WO|HCN4zyQLNE*c-z3Kh8 zBHe3O%k46_;&vI{@}8rA622My`MtT#!8b~SZ)_erUpjWaxcfr6b5|uGc7z`y0FM}` zASpD8D?!u|-RkcBO>$i=cE1YmCnkV&|9mpcEMZe-A=_f}bIwz%V?#X84d(KH{*s|7mqD;geZ4li19BK0n!W zW}ooIKJm=Kz!!%wgQt2%4hUZ!5JwILz8t^+N42_p1`1|D%++r%fxd6RO`$n;EntU@ zRm*H6vg1}aT@dg3TAlQ=ls$hKIn30mWdPCB7&&lbsuhRdqr8?<$NX>RZ{`HD2)7`6 zxC5V8mZd8wD=r|<*%#0TTl0=mko2ZIOzZj}v+>a$plp)9!Jsw42V8;U*27W*6QLcr z2!fUm@+4`@6|`JF?rR)fNSv5X-H3N5siwF5RJ#)M7-l@o&ntb2q4_;u>~>$|HEul6lnxfATZNJPQ6>o6A{7 z)=_w_5`>$X!tf}|hN?nw9y|Pt$G^dVjsnF=`0TF$te}ctl;-hPtZ)6&UyuA{@P2+X zd9jqdSe#51V^=q0S4*+0tJ13Uc&qIo%dO_&Of}%vb1%FfE(Q1RAphpA>7wNnWBJWk zz7)$d4y#f{VwCJe`H>XTN;q1(gK@R0C%0t1Zu#wOyz=;&yC7kwVaU4Qj% zd73wz#T;BkDLY@3nJQ(clCon!UtoaibPrso68t)qm?l|54Nc)K?)d4;43C+aE3$l= zC#dwttn(D_oU$y_GZ41ppCT{gWS5-pgQvX9(rJ1Tnk*Z%=Kxj&jBy!-R`^p!eZ|^a zM$Z@j)%yTNR`^p!?JN9wfKC)!Ur;t?SEnb#^>C?o5F!DfKqzP+cK9*m)jPqe^N{sE zY7Fg06wl*w*X~N74>R}6U^yJA2(0p;YtOo{{^myhX5q7mdq?gI_b0#lW$~3?l#-W= z6IY60Lg<+HdV NL~w5fJq2)O{1Z#tOcDS9 literal 0 HcmV?d00001 diff --git a/TTS/utils/__pycache__/manage.cpython-311.pyc b/TTS/utils/__pycache__/manage.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b154639857141ef4d9298daef9839c3a410bc12 GIT binary patch literal 36630 zcmeIbdvF^^nkNVl08tlgh>ME>lV4TkJF_ys{PKHb{mgDJ(cyP%?9Q0?EuHRv zp@+}nV>u&Z&zh-*>UG*LWKtf#n!uz7+T z;U+927WQlmmrPhktP{2o+k}0@&iwS@(veb@-ZA39)f9G4I7gf;tTWQ87}% 
zp3Py`MCC{&?zy0a{gni*__GD=!O{;cBUMGZ6S|<|9bM2Fs{W}#r~3u|<===qsK2Od zFZ&fe4!7%5##aN;+wDauZggt;;n;XMlq%*YqI@XSZb%si$48?n;F-&)BQt? z-H?vPE5@pR#9+|{+vap*`k?V03r4Hy9o>j2*d8>$qaP^_a(FTaEr49G1ke(+0+s}A zfYzWL&=xEOw9^Q~m~{d=Le5}WsBEk#SpJUjrzrXtN?k_EDI{1Csu(i{UGEr2T&m|v zJXZ#*kVBP{Lv>yb)j>C6xDlh~9bHcDHSn#?^Q{fmAw^x#1Lz6X1J)xR>JV%QHH4@x zMKwwdQLim|Wo}X8Q@y=;;a-+oBjp&>j}-^Eq9u(;)09`2t?+Hm^KF9fwmjcv__pNv zZi8=Yo^Q*X?ty-!6|=b`Wji|+4291ICIh!Z{P6Khqb8+wI-L>sgaCg1Hip6v_3!JV zxMqIZ?o6mcGBI>HU9Q)~4RM{nC=(tx;z>a;@uEkSQA;LF2ZZ@kB4(iboYOvp;(2;oR1iI&pyKHO4{Y6TP8lPi@R*XC#jQnL zo;rl58N+)hsm1Rf4F^y$e}rC|m;D|+>+@Yk{UW}Hfyiy&U?_4oIyLPZn3@=Sc=?oX zbS4s=n($Ae%@N1N3!(@8TZ>_ zoy+|Z_fPuY>z69pgw8&(qHk_68^2k!wMw>DA>)3HO6x|I5sL@vdo~-XQQEnfJ2Ek% zm3hOK?V{M&1uV2)^~muCBHnkl2!ApKeJy#!HgLnx}t0)cHFGRfU_JzmB{>!P4ZPRs}hy+<+K=z_iFmn5HCWF zjjRnO)&^6Y+F**K4Z-4%tlHFi-DJayr~jxVH+9^w3D&1R%q#yRb6)zQpg9KYJX`em z-#kt|-iRBpFml7OjuP*1U?Sulo8l(|(F5M-!|9MO9GJW{gQe0(BxxiR^)cl5tf;#? zrT4~)4tSsGeX&aKwLq9g@i#5=yP@EXU(t+gw=iUwGEMX2lTkKF_&WGA?8w*C z9gX%>ap(bP=SZqJFg+cb3?kW9@-AV;`Kc;tTnmDEW-^)y2ou3=q)MjD=^Ca?EKSNG zCmov!hy9aiSIU}M2r#!JwEod-XFoPOQ#L>4=4Z(wv6{0gI-y4$bR0<|3uxRUUDE1! z+_$0ATaM`0ZDm6FA<=eNvKl|hgQr|{mBiTr3?`O=8q-Ydxe(sa7$77 zx_n>0KG`N;=g+NgZT_V7XI($;T8v7*BP)l*trw-O7w3nP)phU1H+06z>-zP&`VaSh z-2XxUVy9TQN2=Sip)=IL+tVm)ol4vVfLj`ouPe>+^+~mS6)s*$-)x-{J=2nBT43%g zN!R+{qG7%p$*}XosE%SZ^ZV)1$c8k(1NSMXJj&R> zWMhlp3GWEuHyV-hfON^D4)akSlx!Nr8Vd|eZImmN)m{J$(7Jl87|E*YIct)0uW?&e zxh)Cfns>*lcgIr4@?Ft;TJoM2xgm)g61btHV~fCTk^4{G1B7i^|2@DB>pwmEuSkti zu}*Z;DzZt>p|7<)hc!i?P*xmdtNh}|++Ka$6USDZKZq#p1{sG?kMe!wVUoM`74o1? zWF5@+;x0nfrtVDlr&T?_-I{Jxg}gy2&jVep&IR%<#I z4=jC0tQnAM2HOqRl7Sd;)XoepR=b2vpXuuj0+i;}X3B&-`ERs(l^# z%o=8m7M*%6$?ZGzvN2e6m%U5AEUH>HgNEE*4jM1&CaartdYvX-9(8^dYu!cN1H)C_ zeSNS9D?ZT<3Vm@;$KO^z#|@gb{wmgbRcv)=yjT-+VAlL1G2>=Us#dJ?MG5p~{x5F8 z`lx4XvOaER_Z-m}x&06~1&i7GXnNfCc<}MaWTmD*%q$=7qk{d19iHhfE1lP-vM`FE zWRp~&d{8F#VA-}L@*4o3yitI%Fbaxi&9j`QmR73eM@6}9)5Nh+U;K_ZcIv6$5vMdx z{g_u)(DAbGFV1D#k8!j|Q(Nb^$t~!}=3J<*lMYQ;Wx4$pEdMs8qb++i`B!|K+=AuL zZwt*%`fqo|7G5Q)%NxBN@?tMJ%8yS+$EPN}Q)A4FtwG)~eriIqw_!UV?@VNT@|KrH z@s9@|Ja5M&M|x`}#6QgJnfheE=)+LIFMW?p`juT1--W@<_AoJDf4UTw0hQ&A+@6{V z2fa5#-hg*{Dl$%tPcIr00?VdxIM({y6q8C!whBM9XJQi(>3!w_i5iM{C#RxbMsgxH z>kL`}dSd(JK9So(RS7*94TXXc7G-i~;%11a-V59djE4g^!y&oq5%0BF(GBl2N4h9R zI(tDISND4}1hf}Ci|Hvo8VXWCS-&?;uA-x5FC{S%tB8}bs$SVMClaXMAhM{-^#R|O zURh*nYBJ>rhQF{_oWgz05w%PLPx*c~J5?p4G z_*s`^h-#{$Otvs;CQWK9FHl~GcTsk`3G4wVL~)m;hu7rTl>g>KdiAk}vud(CFMN!0 z`gd6R9_co0y1M!~N3vzdTFc(mmc7dcv87LH=@Yr8xsv&F8%1VI#d%@=Q&bk62)Hj0aojA;3TqODJ|^$E7V zq^*4J_}uYtzWGKbq^y+-E~%znu#!uh;XJFo#+|nd%@hz1|TMjM{ zN?VSAzAi%;!2F41<2I=g)=ttC!^)(S1d7UlH6_lJ2I&TcW#d!Mb4GFi}7s^j3IsjxR z+J+?CkYF2PO-ov97p|>Z+XQPHYx(sj*M#!xvJ-7LB-;%k#WZZ^9=X_m{Od`ZT z`mp(kLHAdNvLn?7&DMp&7>W3EK(KZ633%lH3K9lIL?D7dBSKvv=kMv}_56<*(Wdyt zRXlzc9wAnBHjO%5O|r4LN8`lwDVUc!u9{WE(rF2nB4VQC6A_wa)9?aB4G7-fnGa*n zPl24rFq%CDkICWZkKAE3aqJxzpr?P*Px9!P`?__l= zD?rktE$tPtFMdD7m%_k$K@yDd6f$(!amLH@jz_!-MK(<&S%xaXZJG{I=rmf}a|yLU zARPV}0b+jfCkUJ*aEicb0z(AO0K~Wq6%=#j(punCfMM7ZE6I{SF|$k~dHJK1!vMfA zHu-1ifmrH{Drqm4*?}_KM5aYpJLS!O4y8fd9=7dI@B6E@WQ5Y8vF7Ix8%nJ2Be%#~ z8Iq%`Z>{slYUh!aCb9Fl)OlRw+CXg#F0>|gC$n^c2)e*gF&~wxTNg{D>R!RoD>{0U z6}1aNsi9{vCN&&b-X%307Ag*l6^E161s61-dPuAuT6fnA4Sh=`0C1xF70LaI;C_X9 z_b(fThJHBFeMoX265NNF_raxBq2VB$=dGB2^2@QMUME5?)y-#rOOV+h5 z4oY2p%Og_P$tNzUYxv21sq1y&$_=UO4WW)4ID$#_{OOy9o1%ME za*qlb_m!(gsJ)W74*-Xh>`E!+o7yX)>#F3sDr6iZBTgiYATi;VJLIdH-~`PH_Z5+u z1<5i{W^nm@I3ffXF+kfVlJO*P&Hv>R13*Vbv9OGe0^zXtO$Zg=%xql(lR+;Ziq7zp z-hh{OLA1ePGsa7(Ad*EpS!D8N27hS90&gn&yHp88w`o@Tr+S{GTDWY*B|=MLdSW^l 
z+btV2=Fv>ZoKTW#Mo7F0+P_5SbgensRvm4N9?`K!a_kYfJ-^@nie~$Z_RnvocT#`H z{Z^VTCf#@ZRkW8Mp_Z%qt%CK8HfGG5-*?&=%ajYP#G3vS+V_i&1Crx_z#aJgcHnok z9e8gF>PlW{K|wknAua?PW01)Hos7R~Rval^rHgxRhnC76I+~6Yt1Hl#nZUo?q#KWl zjseLrAaDcu-AK|VHgDg-jaHJKJ6EAIq9Pl?Rd1Wst9^20v+mgh(bqQQoQenmS0?Au?!F5;f!S!}{ z1I!!Q=4{8ph)}fyZn0Uu0*cO^l5?lP+=hV?7?2OkEdSjp8?Zk^WnS{#9e(b++s-5? z*)4)f>aN~!VGPvH8a92)@}{UbmneQ*_jt$SBa9DdidAXfb(-Xh#G0P-f-|U{A4R*_ zdmiS4xK+!sc!l!H5nb8Y8&=Ax&40<6u>S^GM*cZK`aSoG+%AdRC2+ek@7Z8EtOw@= zFy~Dd*(rcvA%2U#j6w;{pIE0QygyM30Jj*FuS>7XSJ8Dyavc(w!*HN<29~!WTQnON z8L9JlG&C`)yf@UHC{8xN!|;XjD`qUVP>DG1vRj`7BIW;n3a?)%-jLN$K05w;YN-z9 zvnHoP8$xZ2!jd&@S!xXk)OMeyM58+N<@3(A8raBGV2i|I%f?zNH$9j)euKF&^5irW z1KOc)$zz+==(GkaOO|-aSM*`)kDuP9w~ds7pz6cCyv!@j7G^%O;#rosF{cYk zK?}>qRq*+6yAfR8D0u%2Kc`ILsryi#vz<9LaBTSEG5_TYXD)&^ zyNws2p)saObRaK?VD&Oh;~W~JK#YG+xqJ=)8lzNat6V67ju>1bw$v`Q_N}!ZU2Q!o zwhl_IgHMW9TTctErx};;$YRIRRjK2UP<{kXv>lafM+Mu_q@!k`U2-(9IXYGy9ipR4 za&-ODCOG=w7(>N_p^&PZ@u&k9v<$WCS0 z&R2zuV}$rA(RNz0ofa~V&3Jwv+M-djA`vQ^>Q8W^T@K<9if0FY6kEp3Pb>Q7n?bm% zLNl>Q;ZmIgWg6?q&07FB(pj-hw*%%uPQZj`(Nz@TaC!$v9*{ zk!WCYGz1+R(zL1cXY_m01B3M3B24EoRh|utjZZU$$xqW{0F5oylUg#MDw`r*Hk)m+ zrn71pK;cMVAJ za!t7Knsi}Qv;`$wP{_Eaq{Ao+?FHW<;`MeuiAv|*6m0>?77#K{CJmJLfwKR9I~Iy> zkI@+5BlO}(=R^DARIeHl<(IZ)lb4o>@V`?oV*u$FmT*f|9e<4XwL`M*p!YSReqT#% zQtQFxYhb6Jz9!nPOSbDm#;M-c;sdF9-*V?lR621*v|W{KSA~pw&dd69RD#xMXshTG zxX=Fub7a}qG0BLLj(?+iT4yyK6Y621WC$j^iKua~nlat8+SsH6vB|_OW|N)<7W);N z^zFvsXRVo~W0cuT+zLcP_X7`mBBKF*tOqK^V}Wr#5_-$W=(d=-CmP^;ZpFYxV|+EO z0-k#2A|9K$Y)ZGAWv*LV36Hn|Y!;_VE?+q1fAzw#cMggFgQ`ryYI``H!ciSptZxy1uTXJ{*p?yw2w`;y^ z{>Xa$mJeV5`1%Le7dw_+!k*)@6YEb%^(ROJnzPB8P=a^o(hvkYM=prmMTxs8WSl&D z)D#}QU;6*`KL#N4CZdT*{|GxSqV;TeRtZrup_4wM-O}MIO&BG73-m@oA!U~h#4RR4 zgq-`NBM313^@d2sSH~)^& zLy>%+>i8)nQPTu=D1fFBKT3*esd`P=f1s4_06fw?-P!^U=|Pd(lC;(>#L`mJmhHlh zBT~!JxzqFC5p7L^ttn}*T&NZ7O>oJU?MrT{<$yY{YM~=>U8>zF*vTbZ&Zz?{=3|L2 zsoE#l$t7FPqys@e6x0V)_H-kdmyBN5qu)EB&4m4P0@cp{D_@pXq`2w#U)w&D+uFqY(IF< zrlr|(^0#Nl;cW3_QLuEjnEE8>KntDF$I{PoXk}TjJXjHQF-@wbjA%lp(}~b`y9Yc#?nW-LXc*sL};f>k{F8kcL$qy$>d| z@i0-3hiQR%nD&x+))K6b8}Xhv#H`Qu!z4_>W+DFCMB_Y1SCRDX;+8lUFBu~ZoQ7fk zAEHeD3IV3_LW-d=+rWVL=+*Aw0kTMoy&Js^`3#%DB&gv-;Se;fq5ksFdo~mecxNV| zz~MbEV{`;hPxW|T18ElZW~Q-BJu$<*vA}5P=G4?(7{i1D{AhM1AVWo%kc?0ILJtBk znuBS{#6%=#iveXyQt04`uwoJ&=wr`BZ%^^EwC53d&jI|3UPY|*-t}PT^_~pizegee z89&(Op`es)0vageVJM}{1i~>pd+JW_`%}h&SI?e_*=fsiSYE^eCZku}nLrlYrGHlJ){$?Sw;Rv-AEft&J!#djN~Ha!!;-LZ0CWI7bM z3ofLe5FMpNyN9T4)b5lSnM4Dy2C_w`B7QyugG4?QWGGecoeqkXUKA^dD=6pKf{wkZ_11YP?1b! zK$dYaC@Ty*{8=PSo7fzFdQPYj>sw?E065y}%IwsVHQT^usiNY~FzG~at38<3K9eaxU z)ioyTI+EVzr>-|wW(C)qvP-&d($h`ZC0zk}3dk<$^3#)Fc1hQ5db%w;w6d;z!)l|Z z0?eI8V;gka53KnPt@;kF7)0M7wxBBg7bpiqLuj!>s@c61lxhwxpO9(>S1wC6L+ISr z9qCSbTHh+v9)nxlC102J%Gc$Pe1&>b&w=GTLeELK`HqF_QaO}(I-UgQ%*o1{waV7j zN(dOm%3i6m7Zee8%ZC<^A(KtG`3jqAnz5;(7*UyDR#Rb5u+2g4GmDn6-c1Hj zhJ>od*%ZM%0~}^SZq@b%jC!n1x*bsAzh5M)M(cG~p-PQ?x((V5b}R;^vF;<0yG)ai zjJ&-tDGR`8F+?}Sy^4l-AY=&N5O@y^muIDfe^`_tSwhFQg8y@MYQ{SO`#=(;jE(aX zEali#7)%}D*#og5Zyd30J z|ADE=AhbrwP=5H*@pdlPO7r*oE>FbZ;aUT*`Ex2igR`Ol>>FY(+VG73{7! 
zFc~5KnSLyXiO*DGt$8*55=-ZsjP9XIB6~W ze&4%&f7t)Hf8pY))hocDv#fIdcEYfDODNkTmhF+sVBw>;U(+u)CY@#Pn}1*y+*d{C zh~yj*oFgOyxkk)#1%A!$f1~nIhvwfyH!|N-_IX-SJr7kn*W65`9PR(yTy?bF@OiuO zXt(+E?Rr3sx*HJ|Y@hYN;YQ)4PttxTOOx?qFNL~brJ$XxkeMO(%uj{-sN}pKELZUI zeu01a7irR@rW&(fVWgD5Z%Csh>==L;it$hKICE^5%r737b+Z|Uz|_qd;<-42|EDOL z{|F%E95{FW@+tqNa~J(bPFy&4Y%rzYfw5nlJNA7P*&Ld@H_k&**nl%9AgyB@I>Mj) z&j7UCo{}sY%8}M(0|sb#%hEcm<+fS~-K?!}i>H^~mb#APZoRtZz5DOSe-KZc6LudH ztB*_7$LCJcihuw6Z@>HY!fnyfDmhwd%y|BI=Vu50r2psr!hthl=UJ)qtZ@F)YUd@P z^Ab$h=A+39_xx_?*UUxbeY)}nP<4eRw}M(-lm0ph*{XW#imY+T5(;+rV2> zE!5y~6l`TQiw1DILT)d{jo+jFn(=Mpef|BkjS_08$~Rkjya;4K&Rd}K4cbeFFX1#6 zj+qDer|!o3z1JFVWLMtjF50vfVeVpC&g|)WVs0#OgK4{1ENSVj9wN${Lxo`zYePir z^TJrK5t?Cxu_a#K9um8qf*5C}y_8Mu6gPp*3=WHl@JnbM|8szpiJyQ^%5n_m3(V$2 zmd(FI5grnVQ!I`)Hz6{+bzYF~vhism@zVeq1G`7$p>Xn3`cSTUT|GOaCPe4f_^}9Q zS%IwXu@23$_}sW>37cF5pfe0g9$V*!AiU>%BIlDhA8wvHD!%{ScfY$3TRbQ__DYVu zX?}l2HCt4zVnvr!(M6)+oBDNU`Fq?#o#@;qIkySUZR=HA1n-+-RY0l=2+jaoR`)E{ z38j1BL~gId?G?Da8%0)&E9r1AoE97{a7m~eIXhRKu-aX=iq1jFIVdofENe`R2xUIF z=ftT2J}hg6V{uwnouJy6hD7IK$$40C9{%Q=uN>8qqm7#3$^s@CJG~5Ld{Wz2boSS6 z?P_84jw1d2$3$$ssR2QqzMHq4}sz6UFH3N?+JHfrOF;_B?}H2p0mhy$|*h1 zCyOWczxOtT$>3E$W{w?41SVUdNiVh>lea>_9`A)v z28R*a8;|z%^x))_;;{%AFr)k@C@E!yxB<9=$q{iP3(j|es1fG}$oT4CB1+m0NoHaz z_=o?LQrqNu`om!Kgi|GaXkscFBFRiGa$k(zyKKl3f$5L0$lBv4b9%kly_dNrd&_vZ2|#bXD_W73-Xyh1pd{m*D7PIPWf) zQRR1KC)&c2EiBl=Oq$!XYJ>Xp@_`iy4K7Nyi$cax3%Zd$(u8sGr@p!ENE& zkIa=SP`3gto_x-cV1-JF7ejoahnU0|wA|D4-e3u8tLHte)_UHJI@{EB&D`gXUv{X0 zg^61VlkCOulHB(IaZ5?OXp9cAu?GHEpy3Ruou;$d(&D`Z6JpuWmkc}7K_t=6p5(;P zJ=1xowJxpyB`Z`Zs$axxZ+*_eIIWtPp`Kekh@R1e_k|{X;X7FSLXgWq!Y~!Z4!kw1 zqIL`m>BDr@7>|$f@QAghF!-2lcFuuS-QEt8VZfeu`eAem48L?1Y-yXu8D~+JD%((M zM7PX9yv3v=NXlsRv|qvK5v;HwKeT}$F`1l(p*I*6Y=;E3ITq%<7YJiB!6ZsJYYiu| zjp9HyEc`v$7H+15M=v5{vLx4p`GBAZJ(vzdmP8!Pn^X5xZ)-TlksyR5I{d%Jc%t(* zDJlOKc!+gmsgap!h@3*;9gae-2A#Id{%e$us{}<7t1e91^e{;m3@5DU)aVr0pO81D zj5uKKF59+8T@)DodYl#5z!BR+gu|(sU3pFbhkLN;8C7S|LNL z0Iv-sjey7m@G^A#Ji0vx)^?v--F@mwhq(K)wEMEi?S#1hT;lTL@vo}f@7;Po{DW{} zm(;X->APap390JDoORu5f4ukm{qOcKbc)s%$=dR@uG(@*pLA3Tj>h!{?}xWO4u23{ zJh)sUHVjA&1M}APib^m{`+v}nQ|2mqq>3Io3<<{-)zmNCT`~wYyTqDZQVk5SO_gW$ z%lhON@5lE(xc5g7K76paXLZXCVapD7ve6rhCBoJ>WG8z3lE*K2{7H{@&C|c?>0gdL zIUu}#Q}m2Vo>750cB0V`r5ci*=s6>K&Iq0}8&)bxE>MPFv%-JFpxS@4dqxfAW+F%a z=C`G~N+>J8mM8&`o#=X9a=k9NUdN$QYpy-3u02b!l><+{Bf73hu4{tp8u)I`DritJ z(r%;3P*%fY_O7~mmrk#gJQ)&QS0vXJfw^xsiu5eP{2=PVlp+sduUmCUcFN8an^UH5 zReTpu6=8}23L0tu3Qs$ zotJitH--}O8Pjpt6c5EL80w{$Q_ipg93LD=GVg3T}yWa$63v}_*IG@ece7fK z5O#`ZNi-BBURR@r@2ikVn{Wb&JEb9eX7!I-k&_|K|Mh;$+=?C#$MugK8c|u!PWgKv z336iO2-6S)amI?UcoxUjj$^aMaCTRvXfa8Wu)RIZ1OzRt~zclgWD-dMp`4| zdA2%dbGg9YU(N!@&Qrby73MB4z>VyQjI=anvtbe;RehDE81|&GQW+~!aZFVpt*(}N zCIz{etVBh@{GVg6#kv^Z?<`4kSO>5}L&k9YT(Ce-#30fFmvDfc^8CA_gR53Y*dMCo zktjH3+4JyHC9ly!dW`3%c-UHy0&+&#fNGmc@68$FPaiZ%i=|c?#g>+s_KYjlPJ5A1 z_>Yk@P`FE1Rg0}6fOWm5?!(%|kXW-_s=+Z=M#}&&9KpF$;DrgE^MZq1R=uus>7iJE zM5;eBUxJ-vS?$8aYFVdH)~QC~7O~-|)Npj(`V9i?syepzhw zGTT%#5=h?hj)wRs_NiGyNhU<(?R8)ba(hLd_uTz(0sPOH+pH)RZ3YfSW>?0fh1%V{ zTsYeUW=JnN7j>-&*JYE%963Aum_3c8W$U+?O?f0$OJ@dyj|bD?4z&+ODNWY`O9=7x zW6r=#aD1wVu@L0#sXge&7cH>8W~)WaEiXU;9Le3I2}vnc$&Z++6$?q#W-5_chU6@% zsj+RU>^F~=s>=;ib!M#PIeDn!4X4Sw>^jN+_drEBVpz4;Z!eLRLTUZv{|5@r?BG>h zRFfVMn*NY`ufA{)z9~Y@O?lG6ndaa{u|ASRy_<%n)ARomRqH80gjlM?Gm?Ky)s+8t z@XIj(P`54e-_mMrWK05VW?_$U;^Q+PoPnHm&k50UQu3U{&T+lD<&zUXJM-f+OLfar zV)L-nJS=dH>unvMng7K0bKBCjl^(I}ywrAH;I^sOaN)>BvGtPFdP(4#*E_pE+xsVd zKkr+%3ddd(J71SNUl+Kx^_I3zZv8C$hJ+GP zz{w}DPJd0}UK6<2lmMZ2m$2uYaQ?c;-H^B&0(V0jbWyncrpN^(E+BA$oRUsFK_xCr 
z++~5gtfZonhMqtp?5f0F6}YQP04wQ&aLF%nZ%W*o0{5mi=r!TWsK^B+E+}w8RY^ka z9%0{R>5d2kHw6D3k-IB#cLnaQD$BjX{!7AZ0g<~YaW@6-rYh*DFz6QoVUe4VxCtTS zWEx&AeNbrlzr{GAQ_c9leL2!VOWE`PUM)HXl1Hw;z`y)s#}XGN26D*;qgvpadzdba zvso_umACK&mYr9~FIQ{MNyGEk^<2;wF-If?l@QG1>f733wHHoTjq}@t7{gn%)dSV zcCx8+@#rUg3)}+t)ZMU9yimMOqT|GOmUe#fU8!O3U%12ye8r<5GUT%T$sPNbO6E^4 zTw5&v&@Z~W1y}de?fbMIXo!1j;x(bV6E5khoj*<)j^QZ$hV8#>7Ay8j6{NINwlmqW za}l8n2Sitk;A(l=z9apBlPTA1o>iM?0S2J$lC53HxD6k=g7rrhlBDmyy|DIv zv@x6CowCe*uG|v+VToH3q~DzP%hehHCVaUXsM>Z%k%H&e0P^S^P_Fc5i@)Q_+(Wr? z3lvN{GtaQ8fVpa+LqrOv|6W#=(vqmk=90bBO6~E+->VV~zyew7B4_5<;?}q=ZKmmr_NvmsUmF+W@rM{SFV>^s zO?#9ij(NXLTC_>~bEWFq7V_Ou5<;ebzt5b}0~0|j=mGmPk0Q3GwO+Pp%-BLkF2N+E z7#z(|VB#5VEAdf{Xg93wZh=(~;z;)^9SE~j(fH9%4JlJ}Dhg#V;|x9xmMS6WXFFTj z{*uUoRMGvL!?7BuwJ|390q;?KC<fGukBrAc_@){TEU3)7dK@pILB257>qSqUp;TbNUUV)3R-S>$XJTlkuNT{CGluY%w6VoRnHlid#=f zTThXCY)w5Z?i_U+uqznXuXpbd_6^|>5MlI|(0yC%zAbg%{-I5BcC33E(p%K}W$rIa z|5fQqMBFzd?HhV>RN8k=^t>v0UWI`otZF6_m&CFTsSG5D(Y|dlnDo?tJp94%;sMdK zNAl2zbm&_>0P};%>L!$1c}&0VZkF6T*WCM8-TVKdX*u{W+yA^>bPr1ILBTx;G41=O ze{gysiVxc0b3p=g(B`GD+Monh3>5b0FIfYzsF;XQ$-s6r7!84Fu)s3?NgWF0uqt`DH`1cd~Y0UaYTdrH@a|PrxKp zVpgFr4E<)^Q6)JViB*r5WB}9r+qZaIDBlMs+V)Gf{eo@(dU=ghPUfUI)!ed;3gL9M zbsyJ$Q2R%o4?U2VTX1>^z}%^ zCsuLLs72g+LE3vkth*@HT?Ex!QMd45wW3w1XkB-C7OsF4cWuX~oCMeQZ=TlFW~)$S zIYiAkgeov|mf3e_Ke~`UfTDZ3Mywx@>IX!}pyU{&?mD#Ytok9JHil8EDmwcmXTRX= z|ArbWXBfGRd9rY{>{zev^Y$a2!E(bF#Wp;AQEnWpGJjEJp!*s<-Pcyr{f??*6^1X( z6z9td*K%UV6%*HN4=dwY&I8ZG1z(oPQZtdXM5dOM18E-Ll(gQI zI8^5jQd=WCo3*`MT{nsSLS((RWwZ9UEpAsxZ92Fj{Ud*ETo9VGYGQnoTD(vXXmi%~ zFKzCKoaMhpRW9*=3&{V508y#@ZwdUL1pWg7b|455r2KyZ$fHOhb5|(R+8m0M|Gy}i zLXGnOH$7%~U0NEH|9>d-KNC>VoyeZ3EWQZ4nG^>zP^Lw~8|cw;0rV~-j%AfaiP9D$ zgprprU0KR=y zK%^9zXb496xbOHQnJzc(q?PG<(OYkRJ=i@^(jI0@6!FS92CTkAS2D zriiX;Mlsk-)OtRw{mId^@Tq>q@}%(#o7j9-YCbF03`;e`psFq14CIiN%jf^B4>}~( zLsIn+h)cz$`Rp%u{@LEO-hovdlYdI=9hQ2B#p-iX^|`r|Nk{dVB0d)M`_`=!6{Y&RR>k}ROB*#9H+b?nZg^bG(g2&A&LJ$uMAxQTb zLXhrvbR26nd|7A1{g=tHb0PHLDjD*v}}uPPLhIVUv22a%hn-_?0Sqfw)oifpj31 z{2vbM%26%RqH!1p(g*caL<)GIoVH9x$PiCg;mLvHXb!>-;(@uF2@RelI354e`?u;f0~o-zl47}uYbIiEr~Yul9*lj zau!Eg2(v5XaJy5r?wzHuoHdTPGxAzU%$D2*vB0|5ZcXb*QT0mB-nk=B>$iRU#s_aK4v6*JrTXnl5wxnVWeqEz&p zkUS^Aq<8_2NDd>Tm@zUcAQ#e?P3YHm>=ydY39pR^x9$o%!s3pwv?Kh(Gm@+4?~>PH zMkR2NR2qCucfU~{vjI7GY}v>p%7*>NjWTp4>(mI%&(F(_92j()zHl1{J(e%F=m9kr zfW$Rpm`}hB$)h1d!Cn4OyYYaW_Z(!as-sE%;Dq*=zbNIL{%P??1fL*=G?q$D|Kj@1 zWAk+y3Ad(9E$&jJ)GU7{Z8Z*GVg%@M9BpH~8+sUFg3puz+(SkY;!DNmiH9)U$DwBi z3di6C6_zIfuld^6)Reqc`DBQU?+R5L+`igqloBek{rZ3!%-lH#S=>dpPdog_DgO1myfKr zy&|-|A|C@nXUZ_aA5KIaK8`aRSR>opQkY15Yw+=f@7X|rHuS#xP@;-_OKc9 zkA?%0$Y=;3H}|9A&4|QK>O3qnSZT~NK6L{HoWUUb<`G_r{0D~Pqq7@DHWRnuDK)|7 zcr)NFHbLa80H@PbyP?y9W}^vom;!9Y^r!%5tLYw8a#iqx-fJQ)T@@VEn@kV}E1=Y5 zg4MAC@HmJBDhMB(4X1$8ViTS8uYyipars7(-87_MG;ZkVs)k=15d6+{Il@MR(R5S4 z1fe@!R|e#30cbOtM)Zr9C}c#xVvw%|AmUv@aVltIwJ9;xCP1v>k_Adl$k*+oW-H*P zo+X2s5xyF4s#KzqM=c97n^0>2C5SxAU8a2-It6ri}!QZ**9*#VTi$YT#H*=qum6hH|gk3l(EgXzGA4nPSbkHd19(}Z;aKnWv{ zW|n0!bvA$!l<{D`S-_AEn_lfT{0dDqMwa|DHtA^k>u8diNdgiczsVDdhxqFD3=YH0 zZlF`eup@;zqKrC-2P@-%e?xcwf&gvrQzba)5+{4&u(w;X_6iYxscQ12{31QKLX#nW zeALgLcrvr)Cn-wt)Xh6ku4L8Z|B$>}2y6p*#?jZ{fED}sZqyV`7ugO7&c9xt)L9>~ zzogFci2bQY9W?E~#r4Hg`{T_DAe5sdGGHe@R`Xpmk4m2Zcf| zsoO1R9ciDO(I>6FNgGZc+ncoRPL|`~gTqPJv81haqoTxFgh}WudSu(EJ!jG5r5A?t Pd%k8Nzu6?5_2~Z#a;kBk literal 0 HcmV?d00001 diff --git a/TTS/utils/__pycache__/samplers.cpython-311.pyc b/TTS/utils/__pycache__/samplers.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d93fa4b8d8fdfa7b85aec7b94b680abb934bb651 GIT binary patch literal 
12400 zcmcgSTWl0pmR0?%w%fF^4fxq9g8?@&HYPkB<6tlZLc&07n8cGzQ|>C;4ef5HstmZd z&5{+Xv?K3s6M@nrWh8$@S);&;kBJm%XIDr&TII6~)hex)EFta3PJUL-Ix9ul{n&Hv zt*WkS+e{LTwz~S%Ji;A2`^56)R$#$vZH=L%Ui0U-5VXJZ&mSd z&?kkL64I2ER25#4-kX7&IG{eukIShkUf?I<*QF$%fI0A~aXy|D<1tC;t|A=_$`eW- zxx?;^GQJZlNBW>G?Jp+TT9Vo^b22z5Tpj$cz6Ibu`!S=kaAO>w!S!?VfmyVbSTC6j z`x(6d+$v+?O@@hDV~ET!^MJ)g@hUUZKy)q|jVI%3G+Oc-I7697krLx_2)Zo=qS07F zP?Ts?LBx3e_74L+mlR1>dS(S>vS&zAuBoZ%o|CDm@!3mfdtx(+nwp9xQ=+8wjEoHT z%&75%(xcEdR=THWA8UFX^h zTzhd_IO|ymX5qIAHJO+-TX^unpHT4EaD%Kt*nz#n*(s@F4da++u3FS_($4ub#5m~> ztmHcH7#cf$3I4150o-TLF)R$9Vcy+5$IiR18pAQLAWhc&W#(g+WftmiS>zzRj7`0Qpuf=k_5E}EX4_leuZE}c!&(V z&Z?644LEuaj((ddl9B01kG4Eg9v}E!zt(eH?>VkT2KC4wlxU$7I@ID$5FFOfm@Mxe z=*_u3GR#$r5u8hB($*z|lx4^)+^ID{%90t7>%a4RMM2Z-iFnY{J@r>y4-<# z{aT3ELwteb36c*uq(V}dl3*uQMWa(GaVCNH!D#fo86jbmz$juW7LAfUAh$w)au`6V z#@J;sDUpaqfyA3}JVfs}K~F|SK>&vZF#teH^S9#idbt$_!6ujYg%xHUz_H$w>1#vhjR(0w9O|L=K}$Da+xiQvC%<9+zUO zT?q*P1Cu6l(GV#;OXB~M=RYKo_ViU|B}OPZ6;CB>5FrUo$IgEDaRzF|iMzo-&VOMGWMiHbo@ zsX~HJ&PjlJO*hI)S;C=}^WD8b&9kX8<+YUAd!IFI>WA>ZC+)Bk_;>=)8B3)SfKPm!ACYIIE(;OZ{-A{uvt4`&`jT(N6%#F|jx2X6 z%Bkrn=nQpjr&y%$q%59{rBLsy5?wXg{u|`^k<+Crs&sJM!~)1{#^Hf0^55!XsCb|K zAn23Rhg;`SC;j2)tTz~;G(&Ej0w?;bCm0>?9T)y*!f_lx*sYm?7_QoPm4%}HAg1k zlzO07PG@{_LpBKZkoyjoaaF1?Fbu4jB;#0q&CEJCWH_QAtX zpWR99XBPl6yWv6`o=;Bxc53O3FL&r&16ueMJ^TuN4UkMOQYF_w3!+3z0SOpFk_8pK z3s?uGkDNsfsK=}q)$Q>7NmOQA6N?!6-Mw^M+0%$bzw&S=# zbz)4nK98q5p{3Fc;_`bS@qce+0-J1Bb(_ZR*16pUZuhg`)q}-3sFljIc$M@sZ6hz6DTgds}aNJ3F*+K70Nd;Tp;$Umb?4);6rS4P)W6 zmYrFT&NUXBcYxCnDz?A{A9o(b>-GXU*KQe zU2~oQh{8Fqio41|s=ejG3`@N$ zvTm;TkN*F!z{2N%7fMTVD6`L-f#|tLxI&(MUM>52r(vz<0+|53v$%-ZFCK#eK9d5d zdovIpsi+I>9c{rDaP$w)f9)z;9nW6@3;!zns5r9u2kr0B#%OvFZH#Kg(8b^?+jVi} zGK2oD180uvAtEW|h>PlAR#A}mdFsx|&s9d1PO_zHFG3FrI%J?}YkTRLzj(hPC*0k3 zZ=3ash>O!B#FmK!<)hGmdX@*_uH>T@bIE6@AEcm+A%T(y+;&wyfX{~z97cdn6eu+G z^GjZ$OG3g)girl=cD?ik%zz5BK>-#|7mr9p}K#$efOtr zk7m9Y{<7isVeQ}<{oom`=d9jy_NxQ>IZ10D*W1Sn!7!Z9+IJ!(bX@HCv>V8$)%NPO zz0i56A?sQ3Y!0MY9BdGk$w93tqBliW7=KNefVHLV^w)bghZy~g|xV0r$jlW`JbIwbLU5@lTZ2My7pN5|_ z{2{CzyPzMtpdG%bAHG-^{-r)VrA3l@Bw1+KwXy@Jva*Y*ZCLor6^09Rlt&o+@K*NL za#L&m(r=nSXliLv9@Q^TYaP>2s<*zEJGH#6Ie+kO`d)f*@S|I~TgC8>e6{&o z*$lW4go>fI?5&j`)3_^lYpuFN(Szc3bqv4Vy50o(&`&;y0ehpgQJ%Cg{IKzt2q zSpvFJ$vB1k+Q=CWPa-+cG- zy~~Agx3;DG@n&ty{=(r`wT4%rwBn@3t%nK+lqw}R`1~?T6%cbMA>>N{sGu_fLny7% zEzd#qJ#6tZ0I>dGo9HbsB1fIoTFw@@MqpD14sTx8-@F2Ev^qft7!vf5P~Zd;F{J0n zX*mKH`2_?!0ocP-=%&aBNO&y+H!!OCTX9ukmUnS93bPRy zVt0Chhb?fPb$Q1il=Y_}L7SlCP&~-YgxS$ALRL58gQi|#Z`uQcTwKI7fI2I`4b`v1 zUqMq2h*=%HeTHr&c(xAYWjxl6j!7QOO-SBx7m1F!@6?R?NPfaEdLcT-kt{_3Hi!?h z6e(Gi#90DQ5+4NY|tF&V?)U3tZ_@?Oyw|; z0{BP4t>HT%7ERJWP_H$5wWbhLdBF{E;sHa2J25i$B=KAL!y= z>f-lz@x5LAzIQ1JOPtwD%#xD8B*-j;75@`%Ae>bZ(*6wI{+x`@ye;FCalw-{rj-q` z%w-UrHuu0OsVn99oZIfOWWaT}!#s3X82oNz=#0CvEntEBjT}ZW$gwN2keGu*aAO5$ zT8YoW&OuOS-n~c~SAvYoiS)P&%)~a!OqAb&1`!Xj5~~XNc za|1Jf3$2xI03hQEp>yD8>4Z8}pyR8~jWLO#%+JoYv zQ^y1Y3MA2%A|l$7XfjJa$u{})1J_U4av<&G;B`?a3_S36FbpvjZ!=5nOM?$PZZlNE zvocP{u`Ea;vMi`*3@g0~9AR)L(6FTk5WE`jWQj#g(WGX%d=e_GHOzkkk#pEVD*&Kz z_y|3h4&>Fv!B1{jFDUkYTU5w{(HgS!)b#e@_LIcYCVIs%ope)DB3`OiMHqn8E_u@N z=_n1kpo4Oy%pRNI5fWT>scg~sLnu?`0AP&8nYV1vxUkNJ3tSlDA`pTI6dQTCfL&^E z3^2wQ@h6@?1hhl1>4#o}M=f+t51q?;mP1=|;^L;oO@&a87V6PMJy}nY5aICwn7bA} zqKA)Q7^9^?&b6F{`<}>`J#ga;mD{a4=LRaL%U*?6@;HKubQwuqtwRjrPJ9Fn6yzDq zmDT5l^oHG9s8bJh7P!toJBn!9$z=H|Q6!epmdf=a=G%S55bY^5n|E~n*21j<*G3ts zO6IZGb_ClI>_C9(Uq)h9B=6)7d{=1+{R&?qKoeC#vjI56{H?fZ&=NurW?p)Eh4Fch z60mr)e828pE$amr$pVnW183=ZfXs*>5(6Bp19=;dRrLKTb9j50^Q#<^tt58YEHA7vrXbCV{XkN+ z-0>J$Vv@&)5%_R3R 
z{+CjGViLIMP5Vbfoj1r^r3)yWG#_l18`^&5E-1DqY@Z}`8iifNv^|DYrs?A@%EyEP z$6f+c&*F30pW@-i<~W?XL}f!&G;d83gef*H?yY( ziT&q3PriLP9SM~6HkIT^vXXyNPy|(#<;(Cy!)r*n(k7EljGY{yp)1(Um1{l!Bu<&t)t$-?hq>s6O>?xL(wcKH8|3}b9`3nHwLxQEwnPAzrxMwM@aYuCS zNP#<2VFGMxFLd;2+xoJ>i)eKJ#e!@wNx=JbXuM>C;>K zvac_P_beTL{Q8rfTKJ3}KJ#4N4?z){!uVgf`GEPrOVb!&?ZY!YAgG`E&};ex^WI9E zy)xwM@Dza8?CA!3c-_ur4vqp}{MPyzbDoUH z2m@_dm^L_VeUlcABFobVZTyuMn76fMe*$;48B96>_tcfZz2@{Jde-L{Vj|h%M^^Cp z3G|>~R0X)G(A-DQrC9#JCr1~DKN+xItu>)M(1_URJlt3l0xR8_%cfR3G0&-uZbZaL zDA@x+X2lr=+Xua+f9fp$1sW-B0B{!JK6)KE1WQP+LA%yx7KXVzrh#(9}cjT>TV)GqNkUI+9Hr#sTcHT@14hGxi;R z)ifrVpV1qnuAKYZ^geu4eiQqC8Zal!Q1VO(>SW0ilj4arJ_#}So7dfS#OnW7=tUU< zfU&#K!T^fdBg>FJQ14~5`VPImW6AR{q}3lT1P_DbvM~GG`uxZz?`TbX^rk(K2Z?G; z@YUZjJ#?(V9ito^+}Kk8}(?Ov&|mC*9FaJ$b7yk>z~w-+e#JV2&Q@QZ|Gbav&l z07^b}b{ech+gx$#rABIlQ}Y}5t7(JNt&NhOd_+S(C)tgU#KR(v6QKmA`RJG!7lA+r z{it$4ejgq&exmFIxZ-A6w#Zc9Cch$6bDR8%OlM)^^BMDt!p7%{ua-4Uc}jaq>89OL?HqMEc#=EAfVAb z$zYNIIp-FuSVhTh_vEqp*kbY4t@}9lbjBnC6(>U2k!mP=rndZr6 z;-T3;eC{x52+2{&x5~;O~UL3;yo7nei~jskTLG z%D^;y?3k>H*D~I?Y0Aj>J~n>|mH0}n72pBpZ`9zUS%sYyE%0LX1R=s}2eFrGNnw?(Z7)U6%DUCd^t17h5j7>0)2*q&OZ z^W&OH57PzjAq>-}gZOTM(6GKInJr8&r0;`&Km5bkKUH~c%mBdlE>d?4lRm(*L8*Sc zj2rWDR-(p^0)}L`NbF`jc55~b!!5bUJHfEXY&3II@?Zcr8;{C+gftRYK9j7ogg z2yZGWXHD>CR^!a@rZ6-rTLW*kYFsV6)v0fF@MclpEbwMk->mRP7$$~fZSYpF#yQ~4 zuDmf=Gl%X??Fka+(#5%zw*&?GJjxqM*`U5*`@G5sLF-m+A_N%3o+jOk10 z*RRyRg*?qld{$TgtR3R~Y8*AKwv6r1Y*WH1YPwbN=L3pQ`_zvZXFEwTRm|ZdL)O3! zLXF$B`4m5lurcen3RtpH{Z+itz^{Et{p;E5OkKAAHpLD>KXoavwa;ullme9Sl}ZgiwwG|? z1Lz?wRK@=zpueOI2%Su=mbn0s;jwrs&PJ1T>DRfZ^u)wCP3E?#Xe>@orCE4`NfIZ; zLHb07&H%IH=qL?*j*g}n@Rd24jV@n=7m+w+vYn69z$N3coAeZ$o}pm^rf1po?F18N z=uG@hX3(Z)o`5(Eq7xj=B;+Jeoivn_gvp;yPtjZ)re3HHN6&GI)D8OD>|O3UbFHdG z6=J#Mbc&8gp$1BB@XSV8n7xyCHN~HpqVJ~X=$kOn)3G?4i6&B5^;iPBCw3Fk$6(gy zpi3|rOKQ5AFZGr%7M0!V5kC&kuuvYLdn!(7*V~8`_;CeWf8SsYz$rBX49+Eo$y=JY->9sYZN8?k`xg=BMPFG8MjfPa0^3n#aFGeR_ z{7`p+#7qP1+dvI~LqDQ1s?hQyOZX8DQN$j)|9U!|)ZuwN6}_H}4>1W2pEL?bgSv4( z8clKg-gbGYQE?2`O$;OmQv{q9lae*B%{~kGATmf4Dgl&V4zgbMQ3J{@d&aO zsUba=fh9pi8&)?JO|V@2y>Pcw!_CcttZ9kB${~@7L|BwASQIK)lqFacAy||hSQHFc zlmJ+ql~|nTS)AfooNpy7k~bk)$)RAq{Az*jmVK@=W<+M9DG*IrSnp_rSIMn#5jEbC znwzOevXTRnD~UMC&O~mZge>3yu4F^t@(4>cQCxSiIQ2;mIZ3%q&`~7E?U>Xg`^sGz!D)qvybUT$H7x9s z%v?M<^+$9r3PGe9y1NzV=) zPR~r;J%3^-HpgY4jwz7wxgnTihvqVgBsWxKRS+oR-9fSC6D|T+fXM3@02R=L0r;<(y zSKfmIn69K9TRT2migY(VgT)|uZ!HEp zO^=yqiTY)bzfkT!KFHjxL`jb+K=$|BgNidHZR|BlhkQd-p$ch`nRwOn%Jt z7AfV-FU~A&S#Eg(rR)>!`*{04EF~rO4)brG6?@N>bNEVBEwrKN@-Lm>T|ICY-M$s~ zrh@8#Wl zmDK!!331@OCV9z$rO9P}2WUb`W~9@Yy0CyCTzZ+{|J@)s`la5qnl1}M;bWs-aD|nz zY5p-#cUcgTMK6s+f)XN14L1_E<0(}Z(TFnZg%PYB;r<|aWTGqrR~QS-DGab*b^3do ztXr#unai2Gv+=9L^ktx>7@Htc%vCkZMA#@<6nIiiEE$icAgG2&iIR0(VNziOdjL9v z!{*cM2)OJU;LNw`$WO=DW-&c7V4y)Cd$NIpd`>i${O?| z4@(vk8_>bf)IkNYP7S~qVQ~TfuqpZ(RR1(6=?H=ly~H}1K{M7{2Wd62EPu)~LPY}I z1}hMlCWUX~dc zO-I*qMKs3LulB{?!|0G8aawSltwtjG;EYiQ+wd8 zYDz0l;#o{Q0}fDA8`avAe^+SPA-2F&b>F$_?_Fjdr+EK9!M{)R?;|sg#eUzhV(BSZ zdh*Pp)RP&(GAdd|dCO?A-iy;lCouN=XJpnc*lE#D^U7T^K{QE2C^V+h1r(1>C%h5V z;#Cx>SD1hrV?%nZP??r8*U_a)fsjh|=QpG&Eu5@@aAl}rMKey+2`Y@~T4({ev^C5c z3H7R}@~RR=NsUi^3qeP30_uf-*gXHQRB9}XS&Ed8YxpRv&RHS=YM0L!<21-eb2D)o zs1&5)3nxIEfLekIjonJ6-{GyHbhL`|JQ9P@t22{XGPFOnwtSIshQ1uT^bFkpR z1tl)S5wMz}g4`3oHA>D3x)RyY9KRh+j)g5Ob{D%79n>sJZX$dW5mGLJ?Ze<^f~Jau zS%Jl#z?=mCO)~zsVE{pWL?adRhv8+uOE;V7Kq^9O4$^Ty1qbNBMg@94+x!rPBiCw6 zP;41o&OH85XxT5e>_@sN0u4AFvVfB(R%}HjmiD90G%?&~hl1pa& zI2(~>5kg>3K|US7SMi;YLKVM%6H?E&ug7`>s$l#>NW}AX4`Zz#!AdRmsX?$Jv@~6l{&qAWXNdPwHC(X z8huJ&`D*Ol*q|&TA2sw!ZA4 
z3@Sd#mlhUd(K8U#yH{#Ft?mHjHLOE{Lxg3uPavPVa{|l-7?yTQRi-{N{aK^`8U|xM zrE8?X|L^5h6vFWOiAHQz^n+zBkTQi5a@VwNy@X3dR8@R^g@mf6sm#V$iN6mKSF)p6 zJbM#V!!vQvGReLK?C*}vZ$3wY@kLSUG4Ux-;ekLRixtS-%M6q~4hS{zTLXI*p6mcP zVTUX-qZk3E2hrqQ{0PgYS@r^^KmmfN>!bnDih*WK-Xo4Enp|YOWp-HUmkOhm>TX84 zXePswq9i*JP|A6U0#K2>EBe9>B=4UVtXgDo4puXhWmi{N^uU&l00ior9*m3}Q z^P0;mx_U9|2+^~=S+KvUs!P^n@r5w%%%cyVI-gGv*8R3 zSp5btE9@Vp1(o^h7@K+it?{xpN7;!W*jSOj0VB&&jg0+d6Ob?nj($G@`$P&4Dd*<~ zB^!y3fG&rV0@cVaLm>;>v2Y!G6?#&ZzOR8#YOPK#gTgw=3hyYU$9WSJHhbZrRoGmC z_xYVV6n|}U`!@(H1I|6_+xmvwjtBemr-k~hV*S<Qn#IC(!*E{&$!IQgWMNs8D1gz|r^+e@vBZeb( zx!rPS=`=>FNaBYPufoCC)EF_f49LO>FoV!wqAIUHGW;1DNFc70ODBVCa;y_PR15>S zs{ge_2-9P8_4HfTHdWxfmI%SrfLsp$a9#8_&^_vet!(I#Ue!!&+DThUBlme_y3(qd z^hRKW6i|#WXRYJ`eUqx1Qs9!ZD|A&|%>POKlZ-rMQH*=LKT7$eMZ8zc>+@%C?WU+MK6R`)*3)OhWX|3*(B&}P>Wx-zd)l4#aIHgrmB8@1JMxeHOgp5Odu5q zG@v5dJHQ^nOjK*-v7uTYjXN(x4$$ncGxewtTG+^Lg+@qq<=sYj$5}x#qd^6S9aGjk zN zgPlIdKMs7$^f}N1${)YhV25MWWfYjZvvBJ$rPKN zlWJfee)g_pOMtNxmx+Sm6YTE+sel#qENqsuZ1gUR!$@|_m4!j26tZW9Lxb&;Rq~ix zuaZ{`oBs>MC@T2|%I<-Mz;>a&L#*#WVY$25(D-TJulha-KL~$1^s6DEAtW}077wr1 zH|Anr>=5c{v7W|+^lF3eLI2YBPqsYR0?`(CZhR?&jPW=>ewnvi5iD0k%N6j*cUc(X zh&dD5SA#tOh?1~w0>4gGkL@7Dg(UxaybiQBI;2O1<2dzO!x%7*FhG&qvcW9O&Shg3 zr4R2wEPEG|*kmli>QrgFvJgGrrkzZxq5V%t!J+lzJ*wE)^5E)9d@B*1&nPb(|}s;V`hO(M=S@ICOW`aKo_R$XeNModT;$62%>gjk(o2 z?&8qG0mDgMnYbg7wSlo^SJuX8tjD$TYSy;#E=sk5EoyNWobY#Ps!TwE9LRN$}&A$)_{ zF#-T^kUB%{~S)Vd{MP;L2|nxpNb*e_Lt!tMvjwct=N9rdQBa%!=Y}g+ex{ zQC7n<7Tl>+h{~XqNBQce*nJBg-59#ECKy8;tyjX$SCB{n22VX_1vs8Bs@3(LkAO}W zJRf-8H@r{XF&R3)l?Ma%C^6p>I8wWz9dkpsTkkqy6mU%kWLZ;~Mpt zNAyvZK2Tv*4jKHcW8=~`K75d^+Ts9C+_caD5ZaUuWXyBaf;a2sHf3-ZseEZu z?$YAGdx9^Z!?> z46x9cZDax!8TCy#SwUP_?DWPF7mQ%N*D2k!=Z81=p%L*K5b4#`g?-u;2z6>f*%h&Xf z;)9erxhb#Va!MK~&!$hc;D^?z^q0ql^|~wT&*Qpe-IcB4QdN+fboE4Y336qd^y%2` z*`^AO>hh?R%T(nl_atC8Y;rce4n86kkhHIZmrVe3hf-hdvkHHkm`>}8nJ?1bP zzO~T$5-GQ}D$hnzZfk1S#`UPkMp;W`+!g|FWS>eXr6kZOPhR0za*!%GLqL zcV@lWK(_U9=u7CQuhhOmeqZMCTY$%F`s5#Y`9JV-1%s65|7Bjj^^fM|f1iBZuHCDh zR=TQ6me1DDgN(J%rdv_j^^~op-^y&jXk-VJ4Hkkr37|XM_UF_W>$E4jG(Fk2Y!LHq zlrn1v1%BUL8wi=|0nVA|jYMpaetRxOM?ndx8lK5VHWj61`B3H`q1&%$G*PRl)m9O` zt4?cLj-?ZfypaqC_7Yo^K(dR&>5pJ_b?L$D)L-l2E7AG_)X>){L1_T$Z~M=u;6NMh zLu{sr`Wbg5uIgGu^vh*iait-c6}7%)EB9nN1sfmwcCA05$Yt)rNikBYe5OHH42i9% zy%GAc@rl=iNluscZvd2h^g=l_K~B@Eb-)4+M`xq4M21bLeyAKaUOpB4()z0@^lhj< z2k`m@DZ_+nP{!hF+$A{JtQ=F8 zO`YgCm~(*r(=|kwYji)^wzvyH@O_jp4d@W1uc^I=<6KTtJ%nMbE$jo6jyaruZ!ST$ zLk^xhF*x5&kI)miK}fm|cS>}fMUK~vNj4nQ2yE0P7|DDGEKrWZk)3)xfhZrHa1{ycH)$v7=xwE}smK(lhJ1%^>~qj)3+m zNS0vkTq}2h5=1(n+|rFf&%a43bSA-(lj-G*oW^9G@D9??@g$hfiq7FIt)inftiFF&B0-J z!ZzUyP5DbE^nABsC0DkNQ_pd#XSV>O|6mxLUsQ{z49P1>c;#RyT%7EDsKcRftZNF| zZCv^J1G>RbVI`O!7!V~TJ%4@Oh$wM@sAVYYhDFzf8w`sA0wydN7s;j=1x9CC7BL&w z%94#!KSl#KQ%MXzA&1K-frJ9q5pWC}&o*$9MLCZP2awAt0A1NbkSA=DYGi8(1S8cR zK09{w#4%zT!Tth6tm-@j#gfiNejvHfvf~Dum;#)PBoj02eN0@hR3Bok)EWGx1@1z7ay-jz^PQGVo5r z7ck)T@-WTb0NCL@U}6BFEGlT=G$9x^WLR9l%jcX;v*}s+&^EDeLmd_|-el3j49-J? 
zEhpM%kmg7p>{+EMV}|$9`G^34FH^B|=&O}?z2)=a zNY|3(RQxWnKb6e4;&(ZA5d#CU^bCwdyaH0Tz>g&pv&3>nGN*8$ZTKOKnK*vgLaJrr zw*jeS?D#oAVR5O6!5KK=E+Z(1Ax1FCsb}ITH0?+NSC$MD>`jcYszI_fPZt8gHNU*Q zU0dGUw$7!v_vYep$Z1u-Fd`YD!*O*C<{fgRj71>&Qc@Fkk=i?4~rq&0SKDqqhGVWV@7R_MpTcS)>=h7MS00ZHYk@6fjg2ANM z2?mp1Cmf%4I7=4F?k-U^R)5jw&+S|sExOtk#E{5xOa)}UA%i&u{p5P{6?V} z%&3IsonrIOoO#vm$#wty)csS7r@-pc-n`Vy*VAwpgQ5H>G5E&f=>4&xz2TQ9e{nMR zQ^6jBlis{NRBWSRQyq5Pp`K_t42S<(tqs4qxIFRrU2)5)=U0TDNwH^=gce^ZVG2f)cqnG(eyl^zmAC1Eq_6Pe#?~W2>@mw;j^>2Rc6#I{^ z^j|9UUn=^$pL9R#DVeD7q~Qf+3^zgR!%e)e=XbSKc++DGzx9yNe^~54{A~1jC;pxt z6Z=m?JlxPQFduCfJNJv7C!g;XJKxEj%$@w!7rNdSx+cV~2_bl144&sNToi-vj*DzB9GN3 zx@0sozGrx0GBr1RPk{sD=4mL|sGczI?u2^{d&BH#$eSNId3t9#2W#Npeiv*z8d9NjO_ z$(dHYtxG!|jx6sLyt_s3?h@tqG!@%-Ein&MU~*ZatQ}3oAPt|Ic=%y4G*k@rJsN>; zS`@p7ik*Wc7sQezB{vo5EP0gI+_BuT-}$MI{$hLA!<3r8v*%H;7}`?o+JtWZpbV(g zSl`j~f&%AzbaKZ^eyS~$|H;FVoTb(s#YkcU|aX#6E`ib+3Vu_y=O&?v=jLLfXEOfxP zGgkJVF6=%1{9R%16>;wsq2sF9aW!`wzs|8gzgzw?2jGKe@QMHHj=w(g&1wFv3&Q9{ zar9zg>qQ>!Yg6DBJNkZ|{XF~G2M<5U9fwZE9B*LY2cLf+bPS0dLl7JYtpqj|0-J=u z7BR3TXMvjddX~+PU1I+}-i!CM?r(;~LqO*V@z6B9uXb!&KK_LHIwkHrCUhJZJC5g$ z17?3=UV7&<=R>F9?#-Ej7M46qrlt62^@2N;Gk;=!!5IMcN_Py@rXk+m2RH?X#EW=n zrKd!>9Zi4P{KWTlySNn&k6abP?}}mIdJtUnhD(&o)AX0q{GPM_;jIZ_@Vq#9o&*6- zKlo5=2JE!~V_n-hj5V5A*?Ft5^H#BK8y+2K#iIkQYpsFrYmB>synj2)3A^@ueUslm zDeSx~?z{}sRlXB{OUJ&)^pFH@;L308c>IAlbVeL{hrg5-hh};IX1Le3?fiOB+;(nd zTe7e%S@icnA=`1n*$xxV5CCuPDZ)gZB2_E41Rn;A?VE~$o?^>Dk?t+_>?wvuN?u#D z^92RY_vkDhD>YI6j>Y3eZwp|(W71Fz4J>bcG`tepQ3&l2LOaFK&Xv$`Av7$6_KKms zE1^S$&>@xqpIe#Zo63Em#j+p{>$Ah^T{WZLhpXwc@S>Fa!9ls z;w^^&sTb=x{Akp_-PU~4M*WS=cFJS=8{h6zcGJJJ*MJ`mv3Vf+$2((3*`Gp2Sf1(g zkKr{DZp2So5sgknvPJML@f^sp5y>8jya&n!IfnfLAV_BXL>Kw)5s15)XeJSZg~-h` zL%zkr?!nsOIeHciLRj2kWv`%vM*J*psFNj_WWvQ5?0r%!8f3EHfCJx-lE2mz12f!6 z1P%}FgHUpK=zwen>&B$$;NBiL3ebLk4`f{V2UywBS%kYYRP5@;yPYn(!H!}}fZV}i zOM59eVW`2+?~uouKTxWX18S+(V5v^_g8~~|h zNA@>hxL5Z3D1hH6`qrt|um7 zmNi{ZGvx8yOdi#aA5ttImJ@==4R6mztJ7d;$seqlH7o;2d_IM`;Pd%}{NyiykdFZy zxFfYScsNVrx*KYG5fVJ)*7VA)=`FVg4q-r)#_6|0hS4Y9$0x9VAV%*rIcly#3Ft13 z=DiOg75bN5h+3%8q$^AgsKscfL`9+b~X z$)70*U;wN^%YoeBZTMP9`Ac!+J06l#NkNoP99xILE+8~?IhT(6UuwSk1m zDgq@Ufe{_r2)RVEOP7d6$Qx0hNVR7OCndd0lu~kMj{t-}SoX?nc7!d$E2_x3E8xP( zA%mevS?`fwk*dE(enqO8*WYVY1Fzgg%6^ahG#(#M>FzbEi`U;pY8$V=i_{)oe;284 zUVp=^Za8h=sZ%ezi_`$Gzl+pQc>P_ZcJcbVNFC($cab{6>+e$SwK@Y(#>Q^0>p}Ph MiTnN~DM;`CFRQ38X#fBK literal 0 HcmV?d00001 diff --git a/TTS/utils/audio/__init__.py b/TTS/utils/audio/__init__.py new file mode 100644 index 0000000..f18f221 --- /dev/null +++ b/TTS/utils/audio/__init__.py @@ -0,0 +1 @@ +from TTS.utils.audio.processor import AudioProcessor diff --git a/TTS/utils/audio/__pycache__/__init__.cpython-311.pyc b/TTS/utils/audio/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1060a4383481e54e8a358e7c1d7e63c767a8293a GIT binary patch literal 255 zcmZ3^%ge<81nFs6X{JE>F^B^LOi;#WAs}NqLkdF*V-7L-^Lm*nTh=jErQ7V87e(+8WU4>nIfK0Y%q uvm`!Vub}c5hfQvNN@-52T@eS+P(~mwmIV?Ym>C%vZ!nl#z=n$0fGPmOz(z6v literal 0 HcmV?d00001 diff --git a/TTS/utils/audio/__pycache__/numpy_transforms.cpython-311.pyc b/TTS/utils/audio/__pycache__/numpy_transforms.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1856323875aff393dc5fe0f5566d47dc4b045c47 GIT binary patch literal 24021 zcmc(Hd2kz7nqT82hz59xq$o<1T9ho25)V;_MNyY2$vSLVV~aXGvNaUQ20=(5Ky`yU zI8e;-dJqCW)khjTjf-o3R*#8E;dS(Y)z*8&juUs zq*9YqCcp2!KF}bgk!L5hP2l0{*YEt^cfId@%^wDXegRix`qe&Ux8y#kmUh>pNG8Uw6zs=0E2j 
z6VHh#?=t^9cRaE;9FQyU_2FBIuODAA?7bK?hA|S51MhgxRq3^YcU*E6%2eY^b;YnN z90&(SoGXk9!iZ0PDhHU$t%#pvaejL z8MW%=mGY`PjEUvuN~7mEVf;tJp|r#@poW@(%g=I~DRy+xUs*dhy+bZ(q1Y-i_~`@anQPs*(4i zwKaNc)Y3lCti2_pm8Dw<$otWPgce?q`!Rm|x|L(`0Lrdqy&aSfEuvNG=|wiirD*;z z>bB}5s2_{l0^sY{(E4*FBR&b-+9`+XP+eLj8n3J~1{p7LKW?5{?ayjD@ZlP&BSopbN!ENhKwu!4wfm z!K;Lm6H0s-FuOcRsW3hKauUgRgp})BjVV_O`fJiD2$vkl6jbL+!rKnyx+l?=?7HN( za+3t=*F{#EZnCJm_(};WN6MLUr(E~kKS19fQ~&G&W;~OxADD>7WQnFV9v)6AiHH&! zlZHZSRPFNnC0r-j@YOzcLmBb$uDBdhl+g9QC)A@-C%&NJD3k!DPE{#*;)B3t!5imD zbdQWh;{~LlYwfN=MJzg`B-Bu$Vk8<%h84Bo!kF5V@t05(#s;~nD>QIt?dA$3y zimKh$L+WVvp|E->nHcXrm>3(me)?GV@PwL7j19&Ua#-y?b!wn{A{mXT-JuCNn&`&T zj$a>4DxtVKl2FFfuJP-I>LEf64R(;>Lm_2eg|GT`BvV3hmEfz%Rjgd7UY^-Kw^m!Z z`C;|uobjRDHm$lf``W|m4LRfUNUWRP|GBvOp}0D`dH(eWqq)=Pb8;*vuGYk{yf~I~ zk9~>81z@Cl1S=69jD_QoSz=cIi_;VtVCgD6%v!aT(}{eF#5U=0Q3bK zn8f`Jcgmeb&f@C_$okmKR{bx*uCnL7?MPC~`o(A!EvB54pr;>z20u3G@P@}$(vvhP z4&~kUvNS4t&JAzMlk$!@L7lG2-@)RIPb9{qS#-g!U zR1FU&;_?$omNEMq(Ri{$LUO*3wWl|-M6(G+8aQ?MR2L)p$sXyAwdc_nt%iYoNgYj0 zfN$jJm8cxnADT-QVOVJR`A0DA``2m`lf3aycp$yd$CE zWI`$UF;RW#3w}&qhgA|BjMgPdX`%{s#`Fi13B8cA9uHdaRij8ir!_)VOXlxr!Op3} z3xPULeVQocMJeY#O`q1&n)`GffG9nxtxpeTuV@XO^P<+UTdUoZuiZ2C5@nzJ{&REQ z?{$9EsWo(KwOjJFTc%!G5F65yIdN^yy;f;PZ^NZBi0FmBUw#7#sL&xhF0toIebS^B zL9irPs*Dx}1Y0???ZxHpI!}B-A{PjE#>WeuSR&H1rQpKPU1z~@t>6X+hgE98K%9FJ z50x(D^x&&*L^35TREX2c+mln1kE&~DuVoxF-<XQqGVt=YG7zST=i@(UWULa4dxQabQca7RMWb)A$=_sR}R=yn)ib`c(k zO2OE|m)}J~;s*;42r7!D<%wO;7(~K^kkkGQ3ehHpN=3^>>X}_KRH+@iDX0COgcbyh zJLJR+4<_PQ!b(yir5{aBfSs>|ViRFiN+u*ZJRBW@Zq#*QKNOCo2(L+PmM(1XllsjD zN^~g~BE++VI;o9wf1h-O?l`x1Nr%ECp@~?Mb=hOpAflDpAT-eU@kAt~M3bXTIfu)t zYI$+_3CwTk#Z|NZg3{{OxoCc1k#5gt+Yp^7P7NL-RN!B+4J0GqI_E)}-2({5M5 zeldkDbdCvUO|gOI%nKsKAqo+v?23@6MwySza!hcn1wHhSkH@Y{<4U+QJVrZ+DC6Jc zXjo!KrKCn9@ldQwf>Am$5hDsw!t%s$SfV1s2}KFVz)^`fI4pTJ9F9xza5OSHl$cPW z@rYhq=fr1|KV-@pz3Rq$2&Peq$T~R@Pe#YWFDgnx>62bn!sf^c*&`CBpoWt))Mphb z`ZU6_ebS8Rxm;PrHv!}8zX8m)o%^jSt)x>Riaa+#4jZtR1lZ&*!EqaVN44Pbu9w!@4hM~lu zE=tLQ~_oL{!^b_`ErR05t?;O#LiqF;|SS2i@aey$;VZV}cT zg-UY+RdADBR9-^E!}zj`_|TNeWK!5=a*1uYgvkWCTvZ~MLlW35CaJ@rSXhch&Cl zDt#M~Ku9+Ufht(N?U_@W*qRqxbM98g2zQ+dmEdg>WLrzq^!tx^^4m52zKZ-#IqcH^ zhO2Z>dXXrOU9vsj+^Lc`#%?m&???wsZZj>%G72K9Dj~&EYk+ek+2fIa_S4V*>_4ae zd@r0Fy}0MA<^|n`ViEYFqVX%rF3Km(+3qeuI@>7*@!axFl<8iPcn<5z`0W(xzm2c% zws;(ofH4yM0S@}-e z2z}}BhbQlx$-XkbO>5biZ`rAdy?L=W=kDb~(ot|-4P8-AqdSu?X|IduF1!Sb0K7t63nR(k}!=h zxq_CI26$K|49*(GCC29_SDzY0*^X!8>jBFF*HzXE$p*HV=Pz^_P4f(C?_NBj@pO|@ zLw5mOVCuKvxPF(MZ*Da?lGKGlXh(M zH_2%bMQ89;$*O~#s1quKIbRDt(<+km2)~)?f8_JedZ#bXRL)jT^)HBlTfH}X-#>8M zpfTOzgScXyFVYHLG|%k3e*~b6WL!GQI2p;$W<+76iyb;bj&J=Hf z{9-1l7&YiOS(1CvY}eirlLhS-G1-MOcT$zmlX4dAV%C;mew{MB3V9Wo6lP1MoTFQ^ zV3arMxyM2iTPWo`J^BU`mBs}l$S{Na`f8C-58=^wsLZQKri7pQ0#8_~WjWu059{tU zW*r}_xU*s|id%f{KhJ-kvVIXXjO%~FVpp~RV2R25#J}c39>)!-ge-m+U-q58U0^dA z0h4?Z4t7WiwrGrp_gW&jM9Oll4m2 z+YOkUVuTfg^DR0CzF>>QrbR-Rqnk*=!nR2L5_6zCC5%Tzt}0nFFrIvYTa`B}XT(_% z>>xJg#BO{t{h5gmjslqT@{F9j+b}ViW~Kg?#^5BL{AL)0LW7(o4>rHxftJ`$rmhqmytQ`e@&tJ6qFO6D_^68;g%uarODY!>N z@%SmYY>9RGAS}EHO-h^WHD9Z zE8_bl83J}$S7HJ4lN_7~k%#=A;|HkuG1X=l*ugpZ!{huEg}JY#t4e5m9AwUI%7dff z;Y-qJGCAJY-L0RtbS0EXx11R69)mZkI}(nE>C8(uACuzf8cmMHBpkI&jA2^vh$TZf zH_;n?>fsQH#8~*6gu|7gu!1IVWTe}uIx1^0bmd7Hf{d)E=Oh=~eM{lK45i${g@|D6 zX|gC6tklXqd{y#?LT#-V0?TIiEcnHRs>X$7E%d2fu~6NBPqEq;@ctF{i{8Jcq$v12 zUe2JN@-J>b!U#K_6JJwp8nB%KPN`h8QS7J(n+M(oM;j6Aif;E$9&Gu$$vy=BP=3YCSyD*Xy zBbpe=i;`0=Raaryn@*J1CnJfdt z+Ek64* z9L=Ez=Gn{<17*BCPSoZXg zaQ^5GuWi)k3}pLks*P5XL@xTpu;4M~Zy)25EAwkHID0JRfv~MOaoy-c|9ydx_+&40 zcbzqi5BR#`iKH|Vje`&+a?)JCIvQ5s!5_F39mg&&!rXyJj(7dsXs@i{;(mo`$y#m~ 
z(s5P6IUYq|0gf&cV+A+;!evJO1iDG1y?8*!?_WBj5v#AGP&SE z3z0DH;*nUG%i9XBo-d;_zAx~yhiaj6V9;ooIJ)2%Ty!YObT~`M381>MU{k`QVD;>o zG})L9@4fW>x{p?Tw0ge&$IZF5d$jsJT5xYZxOeLCqnf(g-gNT4!0o{2HR~SMto!~E zt)?Si(=m1Ikq^6!PJGgpnZsI5E1)d48k9rIJUF{MeR*cz?7q*%7MvCEeQZ&1de%_I zs-v3kSl)Lm=R3wv_LgNl#<5qhF5NpbI6Fv%x3Xl~@u+s$?Z%7~Tiw^cFXuMw!e{O^ zB=Z~ijn>kaZ|T#T`n1~J`P$vn-baDjT%ZS^^s%g0tKR@VHdq5zz#8#^dBowBE3>QQamFGLcx#ZKYflW!S&8;n*4!KyIt;Pe{0F6laIqsq_A zC8MAMKXy{mi{s3Td>Z_ayj{ZK`cNV{3O_ZbY?NX9Rg_ayoWtRqzodrR&V452C{`%^ z0vnC1^y9G*$uP(1)P-KlcSH?Du?Qm=Bo#Pl%tHA#TKNKB^$ZfYi2Xt!IJ-05GuxMG z*2Hz$kS1=N>(l%@?mK=`@xbvDzve$Mb#Os!Dhduy<3e>!dfoJu>EMD`g|p$_^gyQn zz1OpFg?_I>YuK0*H|E?Mx!K@jBQ5yHqsie>9ynyeI8L>cA_@FL&=@7NDw(mU{9Qay z{vIXMNK6_bTZg`kM*c4V(1>G-qBj;FoH|*)%Pz!mDq@OLP|(UKOn+?9O7hUCe%bcI zM4jxii`3tB$exrFsW;_7T9NV~^`+cMD?wp?qBOu_%7rwL@*)kUDv(ym)ktgPTBLQd zA1NYZkv7PUNSDh^NLNt$uH#MgeWUV*|Ar_xrz%taYJqfqsuDtZRp@`?p+SWV-JrfL zAUg8-mTd^>6b6xAaG8fFwu6F>IAO!>gX0;Z7?RHRea@2Zpz?!YL-?1waZG0rw{#@Q zA_(hX_*Ehj2?weC=*Kz@HVT)XBI|{b#m=m)!j><{I6ywZt#Dw1g|evTsS~p=hZC`h zu{g|nV`x?YiqQbKLrVj2tkNYBb4Kxgej3FNMS(`zSd^648Ih5=)TSmC9C5$O#fwCn zDTx-r3c{gL@m-bFHk3%jQ2HR-WI+y%^J>qsBVp)TyebT{aMG3d)>zh)oNEk8FJ7am zm3s8|?b|1v?LRYk_}K9mrM-}02nZtj+Ka;o7Tneq=5^XOUO`krXgIvF-NZLlX0Ats zrgaS`;v>!(p6YATNgig)a$W2uPTY*u&O2$TQ%wc<0@;p zjm??~sJ+i`-_l{yXEeUI$3E21II7z_Xxp@jnzfe$&!J#wybG>InS3N|28KE$RoRON zC8X2-M&Ax;6O}H>JZ~GgzAjY85b7ga+R!fV_Jkg-HGo6R3)XBdfY?D$ICaH=4T$lA zRrf;IQdOeyr&i&cnL3Cah3rLzOwU4qhluegFyWYhZw)v;cQ6Uf?OpYpH1dzpK z^$VG@s&N1uf?hJj5cV&m=)Ry13d@EPvE<|!JP=lif*%clW9jrb^4$;U`osN&We^%X zhA~PHSo~__f404O>rZ~s{{o^~uQK&Bt|W$j{>}P7IPza6-`QWNiYHJdF%cOZqz8(M zF1fksR$v9_9>_Kth*wTb48_9j0j_r~m9&n^wNgU*i)kpXwU$#g=g5|VcqI`X4#Q0n zQZFgrLarJ>Vu%|0#MGt(g~#Ed;L?8djta71t0f z2~Dfg$>LhTn!9o8z_gftt+-xjT1ntkN^{pg7K2mAn9VU;two5e!&$F}#7waT?*_MN1kPy$Na(U?Ceng^uewOA| zFDprQbZdCGNZxT%--VY7n4l|oc_$@d>Ri-S8KRCqSLoPaE0|V22>`yX)$CYpWzM&9 z`ZALAVI=rquN!Pghi2ZGedAk)rw&fn(!Toa`zLRo$sEkqesFlMHrKFOYuKDe8tlji zI}r0L)=s^`ghF)TSHJp|t}njWk5SXfTcIMUM8_x!kL^CH5tWbaD74rx_DcxY*g=z` z5KrazkPPF?E|Mu!fytFk7`R=o;FSZ{68@En((Oj7LS|vj2%AC(gZTC&0?U)w9J2+G zLWsq}HYJRbYzxEfJ3X+B`}Ir3nZ%f;EGpF&6-EM(F4F|T!ZVA#LrRR3)es`vdO+{q zEQPRnD;CxkuG+aBkz%|}39F6GX(^DCQWVoKe`iI3~b+<4E8) zL!Tf()gJ4*rK{KKFA@zIA+@$nSa2;)q!7BwTonKzmROXpPV96<$UqqJ_6|-Y;w2ju zGX&SxKH1H*Ahb3JBi93Szj9YSPYkjHEg%+djB<=h|GSwr1!3W-WLqA3Q{!x>lCJUB}|{ z*QG<5wV5%kd2_CDbH1)a6Fc)_=X~WuaaT^XDv*(JLyKRkqsd-C~H5a^u4$O)$)uAJnG6au~K8xQ7WL;PhA zaTuHpN(6qW-$iqHfx?Torbr#!#4X>v(7+`|&&}IJM^hkOneRety z8-D7b{yr58-gM0!d9W$>^}Uwi?GO4*sB1@!)jAd z-1#ks_DeCgdj!k zTJEER6vrDKj?!5zWE|F;g+r30(u4DLSCLJMZ|T~#<5?UiXZZLb!i~^JSZR8k6p01w zT+aa&%14wOMxwfqn1TZPK$<^Z!c)eSe~c$rXt`Sf!2Mb;1cHBGP4~=PnZ0sr^5*1B zYBr^bD>E|U4~^i%2#Vs`{opPS8!F&kRLH9<2zB`<$hR#mkt18$?*hQT3P6Mz_9Dok z{RrY6N*2>PWVcxrWy_DfjEm|Ld)sQ$9NC38U04abov_49Zg8^V#ZI(C410pTV)F%Y z7I$#F`AN?|q>SIEgu()h2q2m#XDf$NaD>Wz-9Eg%$npD6QSf&NzefS!#UG!4`dX%S z=7#2Lp{NM|Y{mP_(gW``-EPXfhOnPgT5aq1hdvsa8<<~v_ssX=AH}tr?V4{#-nWCo zf2z{8GdrhFa53pvjBS&VX#F|1hn?4)ml&26oJDd-4a6M|0S@Y?I--FcNBMs71|m{` zEzjhCTau+nQNu`zODA6$FjVtX?4)O7ErYl$_BsVo3Ez@@0Pi4z>c~bxhjt61bL&n- zwm(B8FAepHtQ7qa#yZ=FOt!NlMRPFvyN57tj28zwckSX5U6}{=aX>*nJDk zQ{lehi7zkZXUkVnzC$p|r`(`0XXyW+@sr^YUTI+VG!0UZXflE_=#0v+m+=mWF(LRg z4a*sNu>*l-Ox&AmY!U^VKY1e6SmLTG;hokv49O6kt}#8qx+LiUPl7*?CfASs0~zcKfcYXKBA+7D}z_qM{v{*s(`WmN1#b=M-$z0 z=3>0|E!-~J>iZ1exAHIWI77_!0st^qGu|LC3LekepH(-`reOW9%vJBus(0k8cg(nP zaNIN9^9!#~Q#X4xeNC&D^3@X0w$78Ga7oW03N1Pw9>mrWk>hxuz+@$2BW!-TBK#ld znrs{dw*WFO*&cFjWDBL+a4P_&oOSG~&d!ER*+R*Yb8l&z)@J|GSlMN76SZB+M3Pjw ze%UQ9gogx-Cy4kZbg=6pR5ib{C7XOB4_2BN_O!?D6mTFnOZZxJEe>!=xh|SxdQuPB 
zl95?W+Q)C7Edpms>U@RS(0_SYG#i-&nkO`dn@zRVX-UjHkt0e^n z2?e`f;j9ti!uA%tVg1EfC`A@gBE8Ys(;@Yor+!ovwn)p71DRgLCSDO6NL)kn#Nx6T*V8& z__ri#IDDz#9h$(NOHuwgiWwWOKc;#_@5%#8*e2eG+RXmL{}2GgYc1Y+`Di^@%<^Zb z`Ujv7^-lpnCgJ2_8Q$_3&aVHF^Pcxd%DpS|!*{=V@0(iN?tI&BZS|h~>OETH9<6%s z)X~S)%Wu7P^Q{k+I}lH8`RcZ*qYD+)MZxXqdDO7#y|3T?`m`VMH?zLHujzB&>W9A7 znaR1`_kEhLKkw_$`T7@pb+@W-R%aZVuQ~5)#=G_lzM9$U&wVWqeJz=bznA!KLR;OP zU)`a1%S0@6b@1}A zDGO=8M>2tJ3VfI{H&fa1gp)?O)$9`qkQc-#+nrE;gM?55HRs?v{gul}(Y;8>g~X$KZFGVa;&d_v;5VfctOnMI=z`Y6wi@QzE}{)kC{`3Yi0FaV z7whAiY&!{SkKXXKgg1*8xI6*tkiCP*Wp9_Ykcc#6;wFc_*@7)vDdizO>Ok^@rNW!; zv{e5dCH^%nRX+fP*V~qgHj-)AOkg&URy5y=Oeov>!N`y5?lsQ4?ykJIQft{he_3nk zMch4mi$sruGm%XsY9c$18@oHr+L@EtLMXwzATGPoUiu#&CDE2~S(d+@<$zlEi72Kd zAi-|yU!nr6K2Gh!vFJE|0p}CS*5Cgn3Rh|MOy?)3=fR%mhoVVUS-_)*ge&3`ggt`b z5K@+&cyCu`bgohpx8}vIpNqTTl>BMyPueu`U|u|!b06d~D`+g|Ahl)Uh`Mv(%LLbY z0NjWpq5^TTw_@OT&`E~-IA~^}N{r(*oj|0L_$Q|JBluqqc;(!T3ZfpDRQ=^!WmEB^ z8DqR%(kte_iZES6P;Oikb7SCwZVgfoJz^_MXg1wdK4W|e1+xAJTGdtnY*&nbP~n<6l5u4AYE5l(Jz5j_ zo97)%{ojctmX8kI88O2tW$7O-EodPb&u>2zHU?0-dW`RB>$Btvmd#N4a}3z-gu~n3 zMzhL)!qdN?c}Z+uk43yAKeHpOrCHbN!fF&fUp3@jc3)qAasn;3~dO z5zLC4a@bzU1A-dA&IQjw!6MN{sz;Ern>C|2vVb z1WAQv?lC5~3tb3F$2SQ<9{D|M-J2F@9v5peHgV-=>5DOW1KM zJE&k@XXcA!P7{UvA8endkYU8M6qA39u^AmOq%rbg2w_Jczx3@v)Y?QWyifTVssW>j zd&aJ#=yEt53qoVg{#g*3a`w+-p=yf#EeLfvEqgZM7q$eSE#N-pbLRM0xKElCs1TyfI!|rjo|gYa}HrGW;>;A7B>lN z*49d}eK&`^Mjq@_>WvXo? z>65q5vAel2`}wr)vu5jQ2~;KQJAm~KIM!K7rtS_3BWvFIV?P6Oub#}miUw`P`0Z+8 z!=?wU=Uvn+PcqIssEE7yp&w%w0CHzu&!4$ql_-I#&_W3=2vxj3q0adeG_)kfo@<}S z0iK%>ZzO5_8y|M}K>36N)-w)O2|Id;Ft%6;pDnWHUSydMhCacNZAJKPKzOeG{xWI@ zwT;B+a?3mj4w;{B{jAS=S_0K02#eUNv6(*gEemx`3(J-lYnvS~AC2U3Mddq_MGxI_ z?jp!1faNL)@)INy#951=Dj`@?tR|>N2-foU>N&SHP_EfFhxXAh_!#&Af;Em&5P+5H zV_+Fx>ac==d=w1yQCk74$xDjx!U18|J{k-h5Jr+|y@Q@`H}7FTpC0(^ zCF^Mkv|8A<^HYd{WkljelEEAR&mCkx57x7v-1))$`G{4h1nLl8a2#@c3V~khfT%JO zJ^|ciRhCJD2zR+q*eX*3wRs(0&RhWIPz65^>ON^^KzUNBfBl5Gi|HspJam+ugU{efpf!=bUc+Wo2c#fM;@IYGU|JLHKVJP#;Im^YvfD z^PwOKk|iXBEu$6-`K=*q*fwel+ehszYzsL?9f-GuN=8fB+c{ds-sS9V4^@O+qb`=# z5vmMVjaG%-qwcV0)Dx~Atq#|W)`V+EYr}P;b>aHa`f$T&L%4CYG2ArT6mA}E4!4Z9 zgj+{j!)>E&;r7vXi(nOw3sTAZf>avl_}C^0ck#!c(N4)WEO?z?QqXHrN)KI&2Vy7B zdaa86NN_x^I7VisLxCZDsv3^_Ba&a1hR6M(fb8`sW&ZHAFCO(tV~TrhCK!@@;XueY z=8pwqO8Iy+JUtT+_$K<5GHHyGA+=I21p?vei!4dyL@*-x0ugCC8jQr1ib***fhb=n z7*?F2s9*A3@n2Rds7&OJO$Wx6ns6`_3dREC6cY+WCgT^CifLId(=CT%zRS_jOgP|+ zMCGtjLHQ^(ic##b_(WWB#pPhw7Yic)ctEKzq#ng3S;xq>NV1|Ax(|cZp>LQLLEp`yGEK$q^Xm{Rm)k2H1$%~s%h3EO@q|EYMKp5 z(`-06VLXjs!Wj+cGhC*WeVn7@Z`C~EB zFHd5%N8;oQh_OIij7LQ&FyWsG#l_41&`cmkF@cDGEEEv^;zTHLHAvoIBp46+L%~^p zJQ$5=rAH?WgLSs&7!LR&!N}yqOo+AGpKG1g zu3R4m<;j@dhZwM7DBzR*@qpL^^tVNfPE#}dA+Mf@IvYnh=#!s@Ok}xGmq_{g#3Q_y ztbaq%NFY}eWs?KE2C-)>8VxK2* z3IY~l7|7VF!xM^5`X*)~;>qE+5cm|yq!!TnmE=EJVN(Swmph|f?UcH|a7?Xf6b}={+ z_l4IOQ1_9yvsa!5Hum7TIdc4lRGC_7v-8LFL*IK5OlA zVB#4e4JiXM@jn1nj5C0fz}9~`8kEE_|M;azIXV-Ou$;kc^@%4V;$$Eakg<~YZSO~p z@u&>&#KHiiARmFJKPaooS3ye?fWugDb`6|&43L(C0hXAE0gae>=(qK6A22Ml>F5<; zgjI3rQFSddvLO1Wv1BpfxgHo7^H>0N2E^l>B|RMso2$-4wgwJV?3n=T`gC;u1D#pL zCkS+PurY8v0k>&u#jQHT zYK7lZU)QdXZ6qiCYfUOkp<))sF&X#;V)~3sf(c-^n^rV-s=jILfr;@Hdt!0R1O28tX|tIV0GF{S7=v#GcG&J=yNd0-k%2V;)6lWs zJ0sJ5NVcv2OTjJ(|DejpzN~{!Da~QokuUclINqgx#k&-L75MwQ0ixm$g@h#`__WUn zD?%1En6M#a)k1cJG;wXhF>R43FDmCxq68sT{469&5h~HXl_NwovnNr3kS0(}xDYDS zzEvWm3Ot2G6GGHl_9Utha%mwqLYg2kQJrf~mG->`-`!fM4k1W)_>-tbs9FuhH>f^A zNYsCf{@>NdK=9S7UlR?;MdQn!gy)7;vPP;*xjhBB>$UWa$laiYnh|Q$LM;e2X`xnx zG>K=T4WSn8TRTFnTBrk|HZ9bNP`efq5$e!FcFA^$&81uuKcL16!j!1`k$Ro#kF+Q3 zl0BE+DLG8v-0`DoIsw9bbC2J%B%C+4#*gbM62g=Y2f~yNA;OfF=w9i^fGk}Z@FrS{ 
zI=j^b@N07vsY*DF;Y67!r9*OFVo(|@KP52- z0b2ERzYu*Uk%vw8~2Wlmuc66=#a`gh%DZu=48tsWYe z>)qEkn98e6di6ZIFVUUY@G;WgWwTE4`SsrP#$Pd{H_V5z$M9<C#vu#B)6z>#&<$Ax9d%ma((0VP1k#_d#`sPSFbU(>Q8J;Y)bSb zydMMq+|_Up6jg*3G#nuj6lp7vg|rpOLK^;%gkdPi_@CD^BvY3sd1WgHXG=J;m1$9a zN!V$&LwsZ1TjEUQQY3mMB3@9Xco(>wx6f-++^2r}km!v2BP8D(3y2{S%F5u$Kyx*bMz8pLyLdS!&WhpzaX1iXo41*1agd2_ zHPLooU*BxGm_V9$ukU^Ly4NAEM=KPkT8`pWMWBjP-Oj1LU}VJGmrJioc;$^OyE+z% z%h(gS3m;{|FpA449g=BbD-I?zkqIR$)dkEqhDsMJC{^a2zf5aK-b6Xe)vZ`CqWk?< z6$jgBD`ncoNvS0Lo3^V#r*eC8rA*t5DDFJYTq(_Q$V#P&iB)P%d>v;B6^|(r!>3dr zM&-H`Cp3|arBTWmBf+>3#TuDbECIz1&1FC-f0a}QM`alzE}~pYxeftJxh`*1?3ev= z3_~cB^3s8AM>wXGkfg#Nmx;_OWi)#^Z75S`vMLo9R1`DQz8Ke{ZiZ4a5Sn0fDicOw z(cTole3E)ZF=PG6x>G6069C!d7hj;a(0um zhn&6Syg&|dFYJ~a_Wb_nR)|LB*w$RK2rJke6x*gN$M+7{qUz~SM2YBgl-5c zr!7L;ra61Y*_3S(=j=Bt^ImN|skYw5K}yk_?bt%^)@)}Vz1y;FJv=^_v%UodlxJHv zQfyOp-FA9+Xkj&XlekcsM2UJ`E!_yYv;6hAxntp4y4jm)_9C__+rF8KX~}kON_B2s zur1hr^U8zWzxLhtu@aiQSR%DWjU5QOv^?u~QC=}C_S3svYf7%cyuv3Hwewe(D>_pZ zosXMZZrkUM+&npVGFw@{(6U^)K2^Cs+tkA2zNl$SweL^Y9LUrhNVyI?si;p?w12U6 zFe`Rv#DPyoGDlxaO-yAbF0a_E{Z(HHaK0fY?b?C@&Mu7H_AU5U%7soZ1+yJJ?B#A< zh%LLjQvBDwFYWHiF#i*m2btoxuP?i{q}VT8G5#R_@JFeNar%wCmKpJL|5rAlvNyxh zd{NUdAOGp*Tbt)fvlTTfg01ZN$FAyB%|P0XQp9i+OsR;*_C3yFPeL@O>LQ`o@{;dt+(== zm5P#<@~;Fq-;hHyv*Hn44J$%P*`|f(*ekp5#s6^jZ_a*xD!uP~X5aa=YtP(?`N@TM z7f(H~WL=&|uJ&bD`@+S=(xui1Q)$=9jO%2|bu!zze(p%d1&|XlW@obP>+l-Nc5S5C z&J_EtRH7DEhwgk0P#2UE?5v^CR7ws<-ytlnu^jkFYz47&U!R5pn-(x}mj$^tt}*Fg zLv4~ZVF?m@r^Vi}{Ma%q6i>y!dF?}rgRM__2t|J)c&##V;qqH>;2$MFeTh+XpvpXt zm*Qb-cS6Qm=Z&82%v;6z3H~P)F$m{|@T8(qi zoW?u%Brw`$CQ|;#3Ck2?uqlaT)%Gc1wWjQ9zTDQK8*y4`$rAOe@r}~7jiH$l)Z>TI zm#5ZA6plTTZ5CTaL(STz@(-ywuCb5kt=-&0Y$s^+P%D3Cv7y6wQk=$u&VpSuhARYW z<#lYdbt@S8poFcq&Q8x_%o0}S8e>rs%crs6^13)NRJmtL+tAWBUN5bJ>epCx^F|Bh z!4kswo!6UJPwUg_!Kz!UY^*`A!+#9RawwQqH&<#yoxaBSH>DJoasGH*RvfX5eki_(wU@`?C}nX7*<-|BDwYw>y;rIn6>F5@ zG7cMO5g7xi^Ms7+dyz`B6Tj`X$!Dni3lvi_%;b_W!ZDZvj&sYj8Jlg&L$=x%8$c`u zW{G&F7NN50C&xZG_9rK9o|v;_E8Lli)~|%-vidLFAiWE3rrnz}?#&=Vm6t5p+PYiM zFKoNDH(7V5eYv(TRojPQR~2R>%h{>_lMK1douKUAyxN6w!Y!k`GuEnjV1@~ z>|ffJUO%v0Kai>)K$c8v@1xe|ms_8IV0maux9-c(??9^Vz!xnWQ=5Y6mZ?n3RH|<3 ziK}wHE>qo=c6H4SW*Zt8x-%`$r5m1Gs>^IUlfJ-aemg>YyGZtCsBdE5EW?I~u^*bTrH# zTPYEmI_4{%G`HVwUu;M>Z`HC|st>ZP$yXO&rd;Xv1DW;%^T)D{?MPo+H+O;&oZ?jG z-+UV?LwEvH!yizYeQQz~>r*lQccC&?otl_&xE>EY`1CrzJC%8+n%AH*1{x#3hLO~$ z3n+jl41_RG!}5#*Xmo;604-EV0Z^j$brw^EuqG-6y#h62^hr;H7@}gvbd-;%3F0(5 zL~)F|=Sln8a#L&N|&=Q8x84-~s%#|E~q{B> zok`W5$wU7-W*;zM@XmF&uV7M;@L@x1IcaBy;BNbnDxNg|*I| zWmJKQV@#xh7;d&vi(n!?G;O_fdCzk5IJ9&2gacZ) z(g_DpTBYBMK|h_-{F9)|%&HPY7>vb(bk|t{9GD2RYsV&&#;;E#m4vI{fk0UXkRM#1iY>AWoc{qmtx+ zTgAqs8lO!3J3ToDosx!iE_wv#NL(U-wOvyE*F zZ)X~}%(4gVh!UQ)p5vk6TsEE>!oK=_8CLf1p12zq8k5g0&OX?eavx7Qk8|u<1m4EMS7{HS z5r$0vJv?S)YDNZ7abeK;0~Dh9iz8J?C-RW5#e}E}%GOXL%GUf(DH|xsjM$-&qy4|t ze;&;`sUCB{;Rv=<6r*EPumy|pwF`4C2KkW7Ya4WCKV{%~3OO;_3^QzJ-teCx1V|=m z_-@!nV<)adwMZXHKaejh7&)?bR)a<%FS}MX&SAhNhk9ycFY}t9Zk?aniZx3a^4jGR zOn+{B4y6wI1lttGivex0t#uJ-ixIRLmiK7{XIt}@w^2nD#T>>6FWwM-+aR=h(=D4b zEfD=fFW_|DbU$*gTXwEX_ADMxI|nk(fs}LLar<-W_TEf;FV_z^UCMtkTylP@8v{gW;*SrdtoDH%3xtAxWWKg#sEgpnz^)(TO(Y@+j7_ikFx2Wu&+=aC`mKT#HwdO$P!eFON$|8Tyqp}#Y}lD{ z?@GIOW!$?`&RvBfh_!$MW%8#OP|Vh!!zlw9CRs+9=#$?=7SQnuytp3oefUiMtLLr9 z_Q%x6?z|;aG}Vu(pQKy3A$;L(UKmUs$#jDz?M%COX52ed&Yd~vzh;7dl8iGN@By+z zqHi$KkSyHHMXra5gw|oC*PevMEa|vzG09v=O0D5@_-Rnt1HVRx;3w3=9x&~Q*Nr@w zj98Q1%Uk2xzF!dHwQ4R%C5V?`4=4{>dRJ>LN~huG-waZXd$zp&zb#VtY$ak-AS5f* zGL7$q&0Kdwk31WfJsXqnq&?n@ z$2&Lp*yYJqxw!_kvGvxEvdpP(&eU&$9<-A5pp~Qttt_9jtW>e$){{22<*}#vmxpd2 zOK$t<f?kONZl}{lc--H7}8V#Jh`g$Tn$(EA*WAd@w%XGkh9sRaSsBHS>x!bQM 
z2S0lK_Un*j^`%^cY1d%JHJH=Hh_GfTJ7U2iBRY6z=t12>EUv?8&*6;c@Z8{&XIoA3 zpP}(4Y6rl7z+z9##gm zhCQ>;9hiZPEeDORN>&3U*XXI5o8I~09kC?nt_<7k{=@tPa2^!`Bj^`Z8idUiF{=Dv zN-qu&!hnAqHcE0IgHi^;UZ)&GJu)p|9h)mIpTQK*VBz)0NQO-Z;{s_G7)Dnt=koy4 zO`MVedCzv_L1f{$}Cd+P=B?26ox4e%xnN_{R5f)7e5_b?mwF9KROqGevz?%>BrXnDRXJ3hL>mjA?+vbOaSh4R+ye$RSjrciGJZ#cBx9Rc zJ`I|Rz#z=|b+cv|hch^5+@zs)y+rKn(R-bD0}R?3C-kBYgJujv`;lh`LtOJ>Lw1h; zX!${yRRknuLjOF`N95}zhhbR;lex|AZHoOFITy(JIh?%Bu2Ms=XKwH8x+36dPVyN+ z=nWKSgiv4F1-q`^r$gvtSL4DUgiakP*Y31ycgD3Fz^U0${Q*#`nl;iID-N+O|92qP zYJ}Lh#?yoM*I=^&K4v6OmU-cb3dHSfhpQ#rk)Rp}>EjuPhWB$GX&)2i*a&Ro*9}jHuYC;mi zOH=^UP|~GKTt_7a!_!eYE*`?M2u7oUw4w5QdNyWx-{)t?U2=KNw7noJRfIO?)zG6J zDDhw2ziR+?ifT9**}tu@+5Tx1VXmlW^JXf#FfFDWiT!kllix4bqmCL39p(M_U^pJ+ zePi!VaWkbhM7~wf2@Y1QR1PI{>0J+M(yP~vJ)i)!(~AHyXKGFI-9OopM>nECson&m`h_8wk=4vqf6E4j_s+I9b_nR zI^{W&_MFLh&Ol^S*+|kLT@a-ExVd~wN2X;9Ofpug%c_9U;Cw^QoOPu^@YKweF{1Df zKmUlHFApj<4D2}JH1*L)1Zk1;aau{-0quj^R=wrk%j8 zV{wt)S%^z&;uk?jp^xAf0{Ed002aZWh8)B2wNRjKD8v*5m#Zl%RMvss+D!n(?voOS zb4}4w!H@};nT%uAhw{$J_YOdDVJw+Wj^(=I!zGJF?y?b2PQrQ%c#_wG+kY;i&+Ov0 zi@+7^OdPu;49XGIi$e11?C;xcKxRd`DcC3P6;D5Pko6{m65qpI^5aVc@`^jJ2lAiN z_d{@?k#m{S>Kkr#p|G<43W>}sg|f}Inej`X>FUaIfK_Z4k}(aPr#TwuulDaDIIb&NG9wL1>Z(&<>DB{PBl4o!MO@xVnPRB4F7sLmSMl zy7TMM#SB?+&CoWQMKSD`aW)>K$VQ03>^7t#{Y@*4E5{jZbTvc*sWS0Ga3EDCUZ_Ong*1)^MbLnRp)MCpuLEC)50^X?0g>Jod1d@BwA}F7}bWavP(@h6+Lu^z!^Dphq{J8 zu`rqfjbQ`&DE}HxjJlK)_m}HefE1v9{ZkK(In4~9xsK@uQ|`{RyEEhNOgTFlHfHAn zO-R^|^lWfx89)eVy@8turr_t#b+fP!#t`|NPYk$i_BTZb97ib8TipFY*I;1u8K1^4 zlUYsC^L-6G)e4-)?#C)R>OSK?*r;KF71SkbhzNGcKEz-4Kh|}&R_jC(# zJuGHu~WU@X@>QbP3AxVeB|1U69=K_9Qf<)_;G3GhZ@M+GNGc>E-{Rxu^^;kFfQI(Pd zRd30^|HXK1(m8(DCo_zIX&o+)qto%k;ZhVW7Xk^CHyH`yzBzv+9=gUZZo!FgBIdz3 z*)qtmOTgW35wiDV<`|sm)${W^@eDYtkd0A6ub9oE+I^jV6WWRuUfCxOv-^M0bap!` z?Y?q@g6^0vXUV(ur6@aLM53p;Ib$K_92&u5y^p!&^vet1>}FCg1c+)|Md#KZLAI5~UC`D=36;c8m{9B1T&`mFxnQ0xsjW{D0fzu@4q zx{EnPXyOSn0ZlXsuBy2Mk3CJvo(I-XM^c_sY0s&Q=M>kUQmj3*`vf;(PI>%kk3Zw_ zLz_zKsLmC^UUt^9VzpJCwSb3tRJUQdZbQzQ-Ls?o{6sUP$Rf9q)*+-m#VJ_}sMj>BIUO?G^mfg-&E#cSQD*>%631-Al}6vf3BC9>@PH2r>l;W!~+*RFoL$p)`TsBD~JrL zWQSQ_hl^H9id%h1dA1=)Y1geko>phlpib7CWbTo<|DYSR)N+q*7qe#n(6?vlpF|oN zhyQJ0`eCqS9Mdz-jEkxn^5yEl;ql<~wN)`FVM+5w@O+NVqrzhFoAD0$Z+e$0L(b4M7;yS|NSFe5A$x_UmgFiv+o~dGL<23bsjQ>Ld$>E zAUr4KJ+9}7#JBB|i5bDsN}2@48jLEX7;Wm9;t_sG(QOk<8&!1X<9d~Uh%xy8dvb@$uTjR!N02S44AZahABV(!F~#+HS^UyLty z|HgX1#DnX_K|DdvTI#(^IvZL)z);^4muL!8pyZ? zQm%o=&dQrkT!Qh)*|F^GScorfPi;7yJeNGTG?3YG_|w3%?}*goz(p9dRhKD24HK}^4R(2lGcdobI2~}YAtTxj zZEI;x9ILo+sk={YhN?24M5Yb$=ZGlSARBX1>YRb&Z(V|?G2>oeWKZB-H*{=>U86L_ z9+^x(huG~$JYq<0LySzfn2ih_lRNM}HZpQ-2yBsM1g#sTuW!IXPl#}ME!gx74SCno zF$fw5pHC_G(Oso8xYXVU69(TqGyYKSODT>@M8|zT`88z4{Wo-&WE@AdFGeNVMf*cWlu?95bhW2m8I%RP< zSD-$D`zB5xG-sI;=N`WHsr=d1)XT$}mtRl4;Y;oCWqL30Bt_iaRgMF=OXha@ij91|Tknl> z-?0;)O>y7b7cy^OWZoxbl`9S;TM^7om&@^da{C?h3ZA8|`{)%s`A(bFu`4Ox!SND! 
zmd5T+F;Biz-ryMEU5AHv9iA1z?DV@G14-+h@)ZG|Mfn~Ig(u(XDsh~#EN;JthLP_< z*F&_7d_`Oj#q+k4@6(#kT8gCNxjj{m+QoAiDU3tygF_EbF;BiDIvr@A=G3io)GQpj zjkNIaM!~~E2IqvOhH}A4*4#l0;Q162tVSAqJzB}EjyeW@We#zE?AFwZ08jGV9kdW0 z-O*r?VjC)Gj^OZ`;8~hs;m@sqhbE|B4ZgN!M>WAvbrMtyuSFP-;`N~BAzG?_Gx&DZ zIcT#&F7G+{7RPzaQjEV-ue{DZMcgeVXdSwb*WxSe^;CzW3_=uMQGSRQXE{n>m2LAJ?>jMZz5s zF?I|^VgFl-K?B#v3fE5y$7Eth*v({2`NOTwNPNMOBGYLzEXa&~h|A_CJ`6oFi-bCg zW5|=li%JPUN5qk%8Dak3xsTI=1M+`HQt&fE>;hbD1ud4W(2z3!vO-6y*zbu@b%XuO z3NMtwQrqtgPp+9B*tyrrqI3M=^m$T*dG7DBjad-aJuh^Gwp7Nc+-~R#m CE{@3n literal 0 HcmV?d00001 diff --git a/TTS/utils/audio/__pycache__/torch_transforms.cpython-311.pyc b/TTS/utils/audio/__pycache__/torch_transforms.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09048350f36215b59fb17e42d988e6a6e55146ec GIT binary patch literal 7009 zcmb_hZ)_CD72mx-zW=_nv5n6_vS5mRiDSnRNyG(6px}fiF(fu^WtuGO+d1Ewz1?GW z5C6%bQi-aU$ZARI2tK5$qpH*hSK>>heC(%w_MwwFt%OuHjUv8OrX*67PknFpZg0^2z)Jm-64k?(r0Fc{DB47>duv$OntS z{01}<4z)~qWPZvU;r&^8i~<8tⅇuUVMER^t1zCIS zq4rRjo9Cy(U}3@vP3kn0o0uG%l*Ay%P3U`fu7Qz0uECZWQ#WM*=%7ly-O0m>0NW2EgWvYMF9n!=!>nW;4E$3`q;W0S(1 zqRIN4U?_`(Nk+r!VsJ=KFNiA9X3X591Zc(lG?)SAXQ`AYhA>UcIYKl6r3e{aE#@`D zB@L9hr40#s0p};Fn+#*G*DbTPyMz=ZS)7wD2!qqQu5KyfL{^xjMIyjx*V@i0I1zh6 zo^wquC5#9!q=toKBr6qF(-2Hu7?V_kxb)WT88l3~rPfK$w`_f+C6)k`)x@1 zf=Wb+`f%5`wAPg93^D6&$KFDx!}nVCjO&_vA@W2Ovw1~(yl@zxwSB+Rv2^%;T=SVHF%aNfuIv#g8YD59y0^0cTb8j_F0 z82A*>%@ngJgB7|UABFR-T_6-^itb;Rxb^04K3_OeoRQG4vZ^j^?Y3-8{lExwIe;5m zGxMhwqmCVIU4qf!=UoT7pwEG0K0(>>`2u*qW-_-$kj_gpnmudR)P@8yJTp8bjIv=w zBXnkANO*y9ObrY$Gs8m5hHN@k=m0tM+!OZTv_XV7Cf`KKfEAJe%Y|N3KV^yraRCqy z)_+2kG_v5DE({YjcuN>W#Y4h?!A1-KJW#>*-Yt;=(6l*RRxT*G>980kRU~Z&?4Znw zIik$u06nJ{Snr%71KffwY~8@hX-Sh?BG8}&uqdEalBz692$m_A$ryn^H5f;zk{N<9 zNQAShGEH?u8V3AOha0bwv*?5lS4U18de9nOD8jA@s#suyhFh$dAdeA?;TEA>RRx(C z8LF_5g|jk>g@R5^A~TV~1xYOuqeU>FZ$^cn4mfxA+2^#vu%czpZl2)D#&I1IR3cq~ za|NoJi>pFdl@vfzNC%iH1tOiZm7x*n(k))K$KKXG_7umgh$~2pQx7C6_f7%%+D=(E zUbWin7cw1o3!?#`m@+;12hVYz@>e+%o{iF)^sKmg8Y4Kabqvpy_!1|ga#l(n;84EJ zd4cmeoDaB0ddQX9fNOL50>GhdtdxSlK?+spLcoO_E({z?Ke4400WRY7MS+VtTnxCF z!?gp~?r?G7;tkH+-KcTgtg~7k&iqlc-Q3r#m;BdZ#y6X@_Fjjzy6LQ!%&&Pijo?H` zlNQ`3`hZRZ;5#p_!>b3mf z#)MiyQG|ePilXIb>5&y~rhZl!a*TT7W(Df`mlbj48CDR{1(SkCO`5mb?67GC>Y>ew zxyi)5PA@|&1OTWxmsV}(%vv}7oT z6?RZSwi4oWQBh?Ktafjz)AAWa%~ITBid#z)NN_vp(?~jzY(s)uL2(BtEzg+wuO;t3hq!|eN#&BVQqQ@?FI(LcT#`e7i?Z%ftzPrSI zyN?UCe;8N_+zIyH4)#`pebr#!da$q7-B9#;Sp~u8-eq;Y|G=ML{_Bgk_Eq{1RQnHn@l2)v zc(wod`su0lo->u6Gu56mOl)!&lT)`R(z(I$p}nF6(7Mr!&l2CNxzyHb7t`Y)JKebzt}_b}epp$H zJ|v|LFg~{UH6RLHt%%^tPMzC1CPILkkcSzNwHRjBbR}E%R|7X_!9{D5T~kwO$TLBz z0QYW!5Y)VEzDvv^)!=i3`$hY$^doF=>6^$&1+wYI`LuB6s4y=~zK2QanRH$^{gNZK*3VYtFp8>WMh`WleE z%2qpNmWz0kQpvoX8qK7ap8wZx7mgpk!CPL4W+|Q_eky7CG`PoEp`vEMHI^(A7BMU@ zT(4cB_ZMJm*+K=u0JPXd4Zd$x<@o!uFDT|*l5NztZLorpsSf7C;QxsbR zh~1Wh5yl6wu$Q=vUM?D2dg;nLAH7n3rIzTp8u=u0?a0UN%kAYOwMcyF;0EUjo#gNK zB(LrNEWPSmi&u8M{KaUc?_j0pP_^e!`SrVr?&V-L(SIi~a62)u%CEkD>u2lF9{T%< z^~69WF5pUOBiIB>!G~VL z=|-WB>>91^8of1INxWE1yvSCq8rg|s57iP~SL2_=SNPS*wfC>Y zD~W^E#6cJlPn3fcv&alOccWl%V1eyFP=R#3(GJg9XWWVttX$m~s_;!#p#I3P1?|lw z+ZCRm-3d@ocS6(}e;vH)rW3{>7`ec2)(JqH!EeP0Ee(tC*_yIKqU*psOI$tUset7% zsD02-kL6>jCQwK-y_hljFzHLRTY>uR&S2EDq1E70y!3 z!ixFAg5_slA=ueVdDsbKbpsb!5ZFngd!WA9=gPu1rJKKpej^Ly5_h+2$MVTLT?4nf z25Rwb)%er3&YtCGz^#{GsdaQO@3KjvYdOkl8*#3C@IDvp2KNS14t?9n^*voprdCsx zbSb1ml*m`obk{qojM{k{|Bo9^MhpX|!>%qehPaUVtrj8bE zHTBY_$&^^`22XV<3Qes1-Hf(xhbI3YF7kifc|!1F>TDOG}1eGyq+Dq18Y< zdCm53%xftUG}EX6fonzMx8Zr&AKc)Olz6VArxxE?>*}p_bb3q1^zdNLqv;jTa_Bzm`(}#~%#{BD$uGVW literal 0 HcmV?d00001 diff --git 
diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py
new file mode 100644
index 0000000..af88569
--- /dev/null
+++ b/TTS/utils/audio/numpy_transforms.py
@@ -0,0 +1,485 @@
+from io import BytesIO
+from typing import Tuple
+
+import librosa
+import numpy as np
+import scipy
+import soundfile as sf
+from librosa import magphase, pyin
+
+# For using kwargs
+# pylint: disable=unused-argument
+
+
+def build_mel_basis(
+    *,
+    sample_rate: int = None,
+    fft_size: int = None,
+    num_mels: int = None,
+    mel_fmax: int = None,
+    mel_fmin: int = None,
+    **kwargs,
+) -> np.ndarray:
+    """Build melspectrogram basis.
+
+    Returns:
+        np.ndarray: melspectrogram basis.
+    """
+    if mel_fmax is not None:
+        assert mel_fmax <= sample_rate // 2
+        assert mel_fmax - mel_fmin > 0
+    return librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=mel_fmin, fmax=mel_fmax)
+
+
+def millisec_to_length(
+    *, frame_length_ms: int = None, frame_shift_ms: int = None, sample_rate: int = None, **kwargs
+) -> Tuple[int, int]:
+    """Compute hop and window length from milliseconds.
+
+    Returns:
+        Tuple[int, int]: window length and hop length for STFT.
+    """
+    factor = frame_length_ms / frame_shift_ms
+    assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
+    win_length = int(frame_length_ms / 1000.0 * sample_rate)
+    hop_length = int(win_length / float(factor))
+    return win_length, hop_length
+
+
+def _log(x, base):
+    if base == 10:
+        return np.log10(x)
+    return np.log(x)
+
+
+def _exp(x, base):
+    if base == 10:
+        return np.power(10, x)
+    return np.exp(x)
+
+
+def amp_to_db(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray:
+    """Convert amplitude values to decibels.
+
+    Args:
+        x (np.ndarray): Amplitude spectrogram.
+        gain (float): Gain factor. Defaults to 1.
+        base (int): Logarithm base. Defaults to 10.
+
+    Returns:
+        np.ndarray: Decibels spectrogram.
+    """
+    assert (x < 0).sum() == 0, " [!] Input values must be non-negative."
+    return gain * _log(np.maximum(1e-8, x), base)
+
+
+# pylint: disable=no-self-use
+def db_to_amp(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray:
+    """Convert decibels spectrogram to amplitude spectrogram.
+
+    Args:
+        x (np.ndarray): Decibels spectrogram.
+        gain (float): Gain factor. Defaults to 1.
+        base (int): Logarithm base. Defaults to 10.
+
+    Returns:
+        np.ndarray: Amplitude spectrogram.
+    """
+    return _exp(x / gain, base)
+
+
+def preemphasis(*, x: np.ndarray, coef: float = 0.97, **kwargs) -> np.ndarray:
+    """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values.
+
+    Args:
+        x (np.ndarray): Audio signal.
+
+    Raises:
+        RuntimeError: Preemphasis coeff is set to 0.
+
+    Returns:
+        np.ndarray: Decorrelated audio signal.
+    """
+    if coef == 0:
+        raise RuntimeError(" [!] Preemphasis is set 0.0.")
+    return scipy.signal.lfilter([1, -coef], [1], x)
+
+
+def deemphasis(*, x: np.ndarray = None, coef: float = 0.97, **kwargs) -> np.ndarray:
+    """Reverse pre-emphasis."""
+    if coef == 0:
+        raise RuntimeError(" [!] Preemphasis is set 0.0.")
+    return scipy.signal.lfilter([1], [1, -coef], x)
+
+
+def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray:
+    """Convert a full scale linear spectrogram output of a network to a melspectrogram.
+
+    Args:
+        spec (np.ndarray): Normalized full scale linear spectrogram.
+
+    Shapes:
+        - spec: :math:`[C, T]`
+
+    Returns:
+        np.ndarray: Normalized melspectrogram.
+ """ + return np.dot(mel_basis, spec) + + +def mel_to_spec(*, mel: np.ndarray = None, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray: + """Convert a melspectrogram to full scale spectrogram.""" + assert (mel < 0).sum() == 0, " [!] Input values must be non-negative." + inv_mel_basis = np.linalg.pinv(mel_basis) + return np.maximum(1e-10, np.dot(inv_mel_basis, mel)) + + +def wav_to_spec(*, wav: np.ndarray = None, **kwargs) -> np.ndarray: + """Compute a spectrogram from a waveform. + + Args: + wav (np.ndarray): Waveform. Shape :math:`[T_wav,]` + + Returns: + np.ndarray: Spectrogram. Shape :math:`[C, T_spec]`. :math:`T_spec == T_wav / hop_length` + """ + D = stft(y=wav, **kwargs) + S = np.abs(D) + return S.astype(np.float32) + + +def wav_to_mel(*, wav: np.ndarray = None, mel_basis=None, **kwargs) -> np.ndarray: + """Compute a melspectrogram from a waveform.""" + D = stft(y=wav, **kwargs) + S = spec_to_mel(spec=np.abs(D), mel_basis=mel_basis, **kwargs) + return S.astype(np.float32) + + +def spec_to_wav(*, spec: np.ndarray, power: float = 1.5, **kwargs) -> np.ndarray: + """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" + S = spec.copy() + return griffin_lim(spec=S**power, **kwargs) + + +def mel_to_wav(*, mel: np.ndarray = None, power: float = 1.5, **kwargs) -> np.ndarray: + """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" + S = mel.copy() + S = mel_to_spec(mel=S, mel_basis=kwargs["mel_basis"]) # Convert back to linear + return griffin_lim(spec=S**power, **kwargs) + + +### STFT and ISTFT ### +def stft( + *, + y: np.ndarray = None, + fft_size: int = None, + hop_length: int = None, + win_length: int = None, + pad_mode: str = "reflect", + window: str = "hann", + center: bool = True, + **kwargs, +) -> np.ndarray: + """Librosa STFT wrapper. + + Check http://librosa.org/doc/main/generated/librosa.stft.html argument details. + + Returns: + np.ndarray: Complex number array. + """ + return librosa.stft( + y=y, + n_fft=fft_size, + hop_length=hop_length, + win_length=win_length, + pad_mode=pad_mode, + window=window, + center=center, + ) + + +def istft( + *, + y: np.ndarray = None, + hop_length: int = None, + win_length: int = None, + window: str = "hann", + center: bool = True, + **kwargs, +) -> np.ndarray: + """Librosa iSTFT wrapper. + + Check http://librosa.org/doc/main/generated/librosa.istft.html argument details. + + Returns: + np.ndarray: Complex number array. + """ + return librosa.istft(y, hop_length=hop_length, win_length=win_length, center=center, window=window) + + +def griffin_lim(*, spec: np.ndarray = None, num_iter=60, **kwargs) -> np.ndarray: + angles = np.exp(2j * np.pi * np.random.rand(*spec.shape)) + S_complex = np.abs(spec).astype(complex) + y = istft(y=S_complex * angles, **kwargs) + if not np.isfinite(y).all(): + print(" [!] Waveform is not finite everywhere. Skipping the GL.") + return np.array([0.0]) + for _ in range(num_iter): + angles = np.exp(1j * np.angle(stft(y=y, **kwargs))) + y = istft(y=S_complex * angles, **kwargs) + return y + + +def compute_stft_paddings( + *, x: np.ndarray = None, hop_length: int = None, pad_two_sides: bool = False, **kwargs +) -> Tuple[int, int]: + """Compute paddings used by Librosa's STFT. 
Compute right padding (final frame) or both sides padding + (first and final frames)""" + pad = (x.shape[0] // hop_length + 1) * hop_length - x.shape[0] + if not pad_two_sides: + return 0, pad + return pad // 2, pad // 2 + pad % 2 + + +def compute_f0( + *, + x: np.ndarray = None, + pitch_fmax: float = None, + pitch_fmin: float = None, + hop_length: int = None, + win_length: int = None, + sample_rate: int = None, + stft_pad_mode: str = "reflect", + center: bool = True, + **kwargs, +) -> np.ndarray: + """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. + + Args: + x (np.ndarray): Waveform. Shape :math:`[T_wav,]` + pitch_fmax (float): Pitch max value. + pitch_fmin (float): Pitch min value. + hop_length (int): Number of frames between STFT columns. + win_length (int): STFT window length. + sample_rate (int): Audio sampling rate. + stft_pad_mode (str): Padding mode for STFT. + center (bool): Centered padding. + + Returns: + np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length` + + Examples: + >>> WAV_FILE = filename = librosa.example('vibeace') + >>> from TTS.config import BaseAudioConfig + >>> from TTS.utils.audio import AudioProcessor + >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1) + >>> ap = AudioProcessor(**conf) + >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate] + >>> pitch = ap.compute_f0(wav) + """ + assert pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`." + assert pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`." + + f0, voiced_mask, _ = pyin( + y=x.astype(np.double), + fmin=pitch_fmin, + fmax=pitch_fmax, + sr=sample_rate, + frame_length=win_length, + win_length=win_length // 2, + hop_length=hop_length, + pad_mode=stft_pad_mode, + center=center, + n_thresholds=100, + beta_parameters=(2, 18), + boltzmann_parameter=2, + resolution=0.1, + max_transition_rate=35.92, + switch_prob=0.01, + no_trough_prob=0.01, + ) + f0[~voiced_mask] = 0.0 + + return f0 + + +def compute_energy(y: np.ndarray, **kwargs) -> np.ndarray: + """Compute energy of a waveform using the same parameters used for computing melspectrogram. + Args: + x (np.ndarray): Waveform. Shape :math:`[T_wav,]` + Returns: + np.ndarray: energy. Shape :math:`[T_energy,]`. :math:`T_energy == T_wav / hop_length` + Examples: + >>> WAV_FILE = filename = librosa.example('vibeace') + >>> from TTS.config import BaseAudioConfig + >>> from TTS.utils.audio import AudioProcessor + >>> conf = BaseAudioConfig() + >>> ap = AudioProcessor(**conf) + >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate] + >>> energy = ap.compute_energy(wav) + """ + x = stft(y=y, **kwargs) + mag, _ = magphase(x) + energy = np.sqrt(np.sum(mag**2, axis=0)) + return energy + + +### Audio Processing ### +def find_endpoint( + *, + wav: np.ndarray = None, + trim_db: float = -40, + sample_rate: int = None, + min_silence_sec=0.8, + gain: float = None, + base: int = None, + **kwargs, +) -> int: + """Find the last point without silence at the end of a audio signal. + + Args: + wav (np.ndarray): Audio signal. + threshold_db (int, optional): Silence threshold in decibels. Defaults to -40. + min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8. + gian (float, optional): Gain to be used to convert trim_db to trim_amp. Defaults to None. + base (int, optional): Base of the logarithm used to convert trim_db to trim_amp. Defaults to 10. 
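A short sketch of the functional pitch and energy extractors above; the pitch bounds and frame settings are illustrative and should match whatever mel configuration is used elsewhere.

import numpy as np
from TTS.utils.audio.numpy_transforms import compute_energy, compute_f0

wav = np.random.uniform(-1, 1, 22050).astype(np.float32)
f0 = compute_f0(x=wav, pitch_fmin=65, pitch_fmax=600, hop_length=256,
                win_length=1024, sample_rate=22050)              # pyin-based pitch track
energy = compute_energy(wav, fft_size=1024, hop_length=256, win_length=1024)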
+ + Returns: + int: Last point without silence. + """ + window_length = int(sample_rate * min_silence_sec) + hop_length = int(window_length / 4) + threshold = db_to_amp(x=-trim_db, gain=gain, base=base) + for x in range(hop_length, len(wav) - window_length, hop_length): + if np.max(wav[x : x + window_length]) < threshold: + return x + hop_length + return len(wav) + + +def trim_silence( + *, + wav: np.ndarray = None, + sample_rate: int = None, + trim_db: float = None, + win_length: int = None, + hop_length: int = None, + **kwargs, +) -> np.ndarray: + """Trim silent parts with a threshold and 0.01 sec margin""" + margin = int(sample_rate * 0.01) + wav = wav[margin:-margin] + return librosa.effects.trim(wav, top_db=trim_db, frame_length=win_length, hop_length=hop_length)[0] + + +def volume_norm(*, x: np.ndarray = None, coef: float = 0.95, **kwargs) -> np.ndarray: + """Normalize the volume of an audio signal. + + Args: + x (np.ndarray): Raw waveform. + coef (float): Coefficient to rescale the maximum value. Defaults to 0.95. + + Returns: + np.ndarray: Volume normalized waveform. + """ + return x / abs(x).max() * coef + + +def rms_norm(*, wav: np.ndarray = None, db_level: float = -27.0, **kwargs) -> np.ndarray: + r = 10 ** (db_level / 20) + a = np.sqrt((len(wav) * (r**2)) / np.sum(wav**2)) + return wav * a + + +def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.ndarray: + """Normalize the volume based on RMS of the signal. + + Args: + x (np.ndarray): Raw waveform. + db_level (float): Target dB level in RMS. Defaults to -27.0. + + Returns: + np.ndarray: RMS normalized waveform. + """ + assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0" + wav = rms_norm(wav=x, db_level=db_level) + return wav + + +def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, **kwargs) -> np.ndarray: + """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + + Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. + + Args: + filename (str): Path to the wav file. + sr (int, optional): Sampling rate for resampling. Defaults to None. + resample (bool, optional): Resample the audio file when loading. Slows down the I/O time. Defaults to False. + + Returns: + np.ndarray: Loaded waveform. + """ + if resample: + # loading with resampling. It is significantly slower. + x, _ = librosa.load(filename, sr=sample_rate) + else: + # SF is faster than librosa for loading files + x, _ = sf.read(filename) + return x + + +def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out=None, **kwargs) -> None: + """Save float waveform to a file using Scipy. + + Args: + wav (np.ndarray): Waveform with float values in range [-1, 1] to save. + path (str): Path to a output file. + sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. 
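A loading-and-normalization sketch with the helpers above; the file name is a placeholder and the trimming threshold is illustrative.

from TTS.utils.audio.numpy_transforms import load_wav, rms_volume_norm, trim_silence

wav = load_wav(filename="example.wav", sample_rate=22050, resample=True)   # placeholder path
wav = trim_silence(wav=wav, sample_rate=22050, trim_db=45, win_length=1024, hop_length=256)
wav = rms_volume_norm(x=wav, db_level=-27.0)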
+ """ + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + + wav_norm = wav_norm.astype(np.int16) + if pipe_out: + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm) + wav_buffer.seek(0) + pipe_out.buffer.write(wav_buffer.read()) + scipy.io.wavfile.write(path, sample_rate, wav_norm) + + +def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray: + mu = 2**mulaw_qc - 1 + signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) + signal = (signal + 1) / 2 * mu + 0.5 + return np.floor( + signal, + ) + + +def mulaw_decode(*, wav, mulaw_qc: int, **kwargs) -> np.ndarray: + """Recovers waveform from quantized values.""" + mu = 2**mulaw_qc - 1 + x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + return x + + +def encode_16bits(*, x: np.ndarray, **kwargs) -> np.ndarray: + return np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16) + + +def quantize(*, x: np.ndarray, quantize_bits: int, **kwargs) -> np.ndarray: + """Quantize a waveform to a given number of bits. + + Args: + x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`. + quantize_bits (int): Number of quantization bits. + + Returns: + np.ndarray: Quantized waveform. + """ + return (x + 1.0) * (2**quantize_bits - 1) / 2 + + +def dequantize(*, x, quantize_bits, **kwargs) -> np.ndarray: + """Dequantize a waveform from the given number of bits.""" + return 2 * x / (2**quantize_bits - 1) - 1 diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py new file mode 100644 index 0000000..c53bad5 --- /dev/null +++ b/TTS/utils/audio/processor.py @@ -0,0 +1,633 @@ +from io import BytesIO +from typing import Dict, Tuple + +import librosa +import numpy as np +import scipy.io.wavfile +import scipy.signal + +from TTS.tts.utils.helpers import StandardScaler +from TTS.utils.audio.numpy_transforms import ( + amp_to_db, + build_mel_basis, + compute_f0, + db_to_amp, + deemphasis, + find_endpoint, + griffin_lim, + load_wav, + mel_to_spec, + millisec_to_length, + preemphasis, + rms_volume_norm, + spec_to_mel, + stft, + trim_silence, + volume_norm, +) + +# pylint: disable=too-many-public-methods + + +class AudioProcessor(object): + """Audio Processor for TTS. + + Note: + All the class arguments are set to default values to enable a flexible initialization + of the class with the model config. They are not meaningful for all the arguments. + + Args: + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + resample (bool, optional): + enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False. + + num_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. + + log_func (int, optional): + log exponent used for converting spectrogram aplitude to DB. + + min_level_db (int, optional): + minimum db threshold for the computed melspectrograms. Defaults to None. + + frame_shift_ms (int, optional): + milliseconds of frames between STFT columns. Defaults to None. + + frame_length_ms (int, optional): + milliseconds of STFT window length. Defaults to None. + + hop_length (int, optional): + number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None. + + win_length (int, optional): + STFT window length. Used if ```frame_length_ms``` is None. Defaults to None. + + ref_level_db (int, optional): + reference DB level to avoid background noise. In general <20DB corresponds to the air noise. 
Defaults to None. + + fft_size (int, optional): + FFT window size for STFT. Defaults to 1024. + + power (int, optional): + Exponent value applied to the spectrogram before GriffinLim. Defaults to None. + + preemphasis (float, optional): + Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0. + + signal_norm (bool, optional): + enable/disable signal normalization. Defaults to None. + + symmetric_norm (bool, optional): + enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None. + + max_norm (float, optional): + ```k``` defining the normalization range. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms. Defaults to None. + + pitch_fmin (int, optional): + minimum filter frequency for computing pitch. Defaults to None. + + pitch_fmax (int, optional): + maximum filter frequency for computing pitch. Defaults to None. + + spec_gain (int, optional): + gain applied when converting amplitude to DB. Defaults to 20. + + stft_pad_mode (str, optional): + Padding mode for STFT. Defaults to 'reflect'. + + clip_norm (bool, optional): + enable/disable clipping the our of range values in the normalized audio signal. Defaults to True. + + griffin_lim_iters (int, optional): + Number of GriffinLim iterations. Defaults to None. + + do_trim_silence (bool, optional): + enable/disable silence trimming when loading the audio signal. Defaults to False. + + trim_db (int, optional): + DB threshold used for silence trimming. Defaults to 60. + + do_sound_norm (bool, optional): + enable/disable signal normalization. Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + + do_amp_to_db_mel (bool, optional): + enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + + do_rms_norm (bool, optional): + enable/disable RMS volume normalization when loading an audio file. Defaults to False. + + db_level (int, optional): + dB level used for rms normalization. The range is -99 to 0. Defaults to None. + + stats_path (str, optional): + Path to the computed stats file. Defaults to None. + + verbose (bool, optional): + enable/disable logging. Defaults to True. 
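Putting the arguments above together, a minimal construction-and-use sketch; the values mirror a common 22.05 kHz setup and are illustrative rather than defaults mandated by this class.

import numpy as np
from TTS.utils.audio.processor import AudioProcessor

ap = AudioProcessor(sample_rate=22050, num_mels=80, fft_size=1024, win_length=1024,
                    hop_length=256, mel_fmin=0, mel_fmax=8000, min_level_db=-100,
                    power=1.5, griffin_lim_iters=30)
wav = np.random.uniform(-1, 1, 22050).astype(np.float32)   # noise stands in for speech
mel = ap.melspectrogram(wav)                                # [num_mels, T]
wav_hat = ap.inv_melspectrogram(mel)                        # Griffin-Lim reconstruction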
+ + """ + + def __init__( + self, + sample_rate=None, + resample=False, + num_mels=None, + log_func="np.log10", + min_level_db=None, + frame_shift_ms=None, + frame_length_ms=None, + hop_length=None, + win_length=None, + ref_level_db=None, + fft_size=1024, + power=None, + preemphasis=0.0, + signal_norm=None, + symmetric_norm=None, + max_norm=None, + mel_fmin=None, + mel_fmax=None, + pitch_fmax=None, + pitch_fmin=None, + spec_gain=20, + stft_pad_mode="reflect", + clip_norm=True, + griffin_lim_iters=None, + do_trim_silence=False, + trim_db=60, + do_sound_norm=False, + do_amp_to_db_linear=True, + do_amp_to_db_mel=True, + do_rms_norm=False, + db_level=None, + stats_path=None, + verbose=True, + **_, + ): + # setup class attributed + self.sample_rate = sample_rate + self.resample = resample + self.num_mels = num_mels + self.log_func = log_func + self.min_level_db = min_level_db or 0 + self.frame_shift_ms = frame_shift_ms + self.frame_length_ms = frame_length_ms + self.ref_level_db = ref_level_db + self.fft_size = fft_size + self.power = power + self.preemphasis = preemphasis + self.griffin_lim_iters = griffin_lim_iters + self.signal_norm = signal_norm + self.symmetric_norm = symmetric_norm + self.mel_fmin = mel_fmin or 0 + self.mel_fmax = mel_fmax + self.pitch_fmin = pitch_fmin + self.pitch_fmax = pitch_fmax + self.spec_gain = float(spec_gain) + self.stft_pad_mode = stft_pad_mode + self.max_norm = 1.0 if max_norm is None else float(max_norm) + self.clip_norm = clip_norm + self.do_trim_silence = do_trim_silence + self.trim_db = trim_db + self.do_sound_norm = do_sound_norm + self.do_amp_to_db_linear = do_amp_to_db_linear + self.do_amp_to_db_mel = do_amp_to_db_mel + self.do_rms_norm = do_rms_norm + self.db_level = db_level + self.stats_path = stats_path + # setup exp_func for db to amp conversion + if log_func == "np.log": + self.base = np.e + elif log_func == "np.log10": + self.base = 10 + else: + raise ValueError(" [!] unknown `log_func` value.") + # setup stft parameters + if hop_length is None: + # compute stft parameters from given time values + self.win_length, self.hop_length = millisec_to_length( + frame_length_ms=self.frame_length_ms, frame_shift_ms=self.frame_shift_ms, sample_rate=self.sample_rate + ) + else: + # use stft parameters from config file + self.hop_length = hop_length + self.win_length = win_length + assert min_level_db != 0.0, " [!] min_level_db is 0" + assert ( + self.win_length <= self.fft_size + ), f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + members = vars(self) + if verbose: + print(" > Setting up Audio Processor...") + for key, value in members.items(): + print(" | > {}:{}".format(key, value)) + # create spectrogram utils + self.mel_basis = build_mel_basis( + sample_rate=self.sample_rate, + fft_size=self.fft_size, + num_mels=self.num_mels, + mel_fmax=self.mel_fmax, + mel_fmin=self.mel_fmin, + ) + # setup scaler + if stats_path and signal_norm: + mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) + self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + self.signal_norm = True + self.max_norm = None + self.clip_norm = None + self.symmetric_norm = None + + @staticmethod + def init_from_config(config: "Coqpit", verbose=True): + if "audio" in config: + return AudioProcessor(verbose=verbose, **config.audio) + return AudioProcessor(verbose=verbose, **config) + + ### normalization ### + def normalize(self, S: np.ndarray) -> np.ndarray: + """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` + + Args: + S (np.ndarray): Spectrogram to normalize. + + Raises: + RuntimeError: Mean and variance is computed from incompatible parameters. + + Returns: + np.ndarray: Normalized spectrogram. + """ + # pylint: disable=no-else-return + S = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, "mel_scaler"): + if S.shape[0] == self.num_mels: + return self.mel_scaler.transform(S.T).T + elif S.shape[0] == self.fft_size / 2: + return self.linear_scaler.transform(S.T).T + else: + raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") + # range normalization + S -= self.ref_level_db # discard certain range of DB assuming it is air noise + S_norm = (S - self.min_level_db) / (-self.min_level_db) + if self.symmetric_norm: + S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm + if self.clip_norm: + S_norm = np.clip( + S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + ) + return S_norm + else: + S_norm = self.max_norm * S_norm + if self.clip_norm: + S_norm = np.clip(S_norm, 0, self.max_norm) + return S_norm + else: + return S + + def denormalize(self, S: np.ndarray) -> np.ndarray: + """Denormalize spectrogram values. + + Args: + S (np.ndarray): Spectrogram to denormalize. + + Raises: + RuntimeError: Mean and variance are incompatible. + + Returns: + np.ndarray: Denormalized spectrogram. + """ + # pylint: disable=no-else-return + S_denorm = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, "mel_scaler"): + if S_denorm.shape[0] == self.num_mels: + return self.mel_scaler.inverse_transform(S_denorm.T).T + elif S_denorm.shape[0] == self.fft_size / 2: + return self.linear_scaler.inverse_transform(S_denorm.T).T + else: + raise RuntimeError(" [!] 
Mean-Var stats does not match the given feature dimensions.") + if self.symmetric_norm: + if self.clip_norm: + S_denorm = np.clip( + S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + ) + S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db + return S_denorm + self.ref_level_db + else: + if self.clip_norm: + S_denorm = np.clip(S_denorm, 0, self.max_norm) + S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db + return S_denorm + self.ref_level_db + else: + return S_denorm + + ### Mean-STD scaling ### + def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]: + """Loading mean and variance statistics from a `npy` file. + + Args: + stats_path (str): Path to the `npy` file containing + + Returns: + Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to + compute them. + """ + stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg + mel_mean = stats["mel_mean"] + mel_std = stats["mel_std"] + linear_mean = stats["linear_mean"] + linear_std = stats["linear_std"] + stats_config = stats["audio_config"] + # check all audio parameters used for computing stats + skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"] + for key in stats_config.keys(): + if key in skip_parameters: + continue + if key not in ["sample_rate", "trim_db"]: + assert ( + stats_config[key] == self.__dict__[key] + ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + return mel_mean, mel_std, linear_mean, linear_std, stats_config + + # pylint: disable=attribute-defined-outside-init + def setup_scaler( + self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray + ) -> None: + """Initialize scaler objects used in mean-std normalization. + + Args: + mel_mean (np.ndarray): Mean for melspectrograms. + mel_std (np.ndarray): STD for melspectrograms. + linear_mean (np.ndarray): Mean for full scale spectrograms. + linear_std (np.ndarray): STD for full scale spectrograms. + """ + self.mel_scaler = StandardScaler() + self.mel_scaler.set_stats(mel_mean, mel_std) + self.linear_scaler = StandardScaler() + self.linear_scaler.set_stats(linear_mean, linear_std) + + ### Preemphasis ### + def apply_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values. + + Args: + x (np.ndarray): Audio signal. + + Raises: + RuntimeError: Preemphasis coeff is set to 0. + + Returns: + np.ndarray: Decorrelated audio signal. + """ + return preemphasis(x=x, coef=self.preemphasis) + + def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Reverse pre-emphasis.""" + return deemphasis(x=x, coef=self.preemphasis) + + ### SPECTROGRAMs ### + def spectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a spectrogram from a waveform. + + Args: + y (np.ndarray): Waveform. + + Returns: + np.ndarray: Spectrogram. 
+ """ + if self.preemphasis != 0: + y = self.apply_preemphasis(y) + D = stft( + y=y, + fft_size=self.fft_size, + hop_length=self.hop_length, + win_length=self.win_length, + pad_mode=self.stft_pad_mode, + ) + if self.do_amp_to_db_linear: + S = amp_to_db(x=np.abs(D), gain=self.spec_gain, base=self.base) + else: + S = np.abs(D) + return self.normalize(S).astype(np.float32) + + def melspectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a melspectrogram from a waveform.""" + if self.preemphasis != 0: + y = self.apply_preemphasis(y) + D = stft( + y=y, + fft_size=self.fft_size, + hop_length=self.hop_length, + win_length=self.win_length, + pad_mode=self.stft_pad_mode, + ) + S = spec_to_mel(spec=np.abs(D), mel_basis=self.mel_basis) + if self.do_amp_to_db_mel: + S = amp_to_db(x=S, gain=self.spec_gain, base=self.base) + + return self.normalize(S).astype(np.float32) + + def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: + """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" + S = self.denormalize(spectrogram) + S = db_to_amp(x=S, gain=self.spec_gain, base=self.base) + # Reconstruct phase + W = self._griffin_lim(S**self.power) + return self.apply_inv_preemphasis(W) if self.preemphasis != 0 else W + + def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" + D = self.denormalize(mel_spectrogram) + S = db_to_amp(x=D, gain=self.spec_gain, base=self.base) + S = mel_to_spec(mel=S, mel_basis=self.mel_basis) # Convert back to linear + W = self._griffin_lim(S**self.power) + return self.apply_inv_preemphasis(W) if self.preemphasis != 0 else W + + def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: + """Convert a full scale linear spectrogram output of a network to a melspectrogram. + + Args: + linear_spec (np.ndarray): Normalized full scale linear spectrogram. + + Returns: + np.ndarray: Normalized melspectrogram. + """ + S = self.denormalize(linear_spec) + S = db_to_amp(x=S, gain=self.spec_gain, base=self.base) + S = spec_to_mel(spec=np.abs(S), mel_basis=self.mel_basis) + S = amp_to_db(x=S, gain=self.spec_gain, base=self.base) + mel = self.normalize(S) + return mel + + def _griffin_lim(self, S): + return griffin_lim( + spec=S, + num_iter=self.griffin_lim_iters, + hop_length=self.hop_length, + win_length=self.win_length, + fft_size=self.fft_size, + pad_mode=self.stft_pad_mode, + ) + + def compute_f0(self, x: np.ndarray) -> np.ndarray: + """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. + + Args: + x (np.ndarray): Waveform. + + Returns: + np.ndarray: Pitch. 
+ + Examples: + >>> WAV_FILE = filename = librosa.example('vibeace') + >>> from TTS.config import BaseAudioConfig + >>> from TTS.utils.audio import AudioProcessor + >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1) + >>> ap = AudioProcessor(**conf) + >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate] + >>> pitch = ap.compute_f0(wav) + """ + # align F0 length to the spectrogram length + if len(x) % self.hop_length == 0: + x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode) + + f0 = compute_f0( + x=x, + pitch_fmax=self.pitch_fmax, + pitch_fmin=self.pitch_fmin, + hop_length=self.hop_length, + win_length=self.win_length, + sample_rate=self.sample_rate, + stft_pad_mode=self.stft_pad_mode, + center=True, + ) + + return f0 + + ### Audio Processing ### + def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int: + """Find the last point without silence at the end of a audio signal. + + Args: + wav (np.ndarray): Audio signal. + threshold_db (int, optional): Silence threshold in decibels. Defaults to -40. + min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8. + + Returns: + int: Last point without silence. + """ + return find_endpoint( + wav=wav, + trim_db=self.trim_db, + sample_rate=self.sample_rate, + min_silence_sec=min_silence_sec, + gain=self.spec_gain, + base=self.base, + ) + + def trim_silence(self, wav): + """Trim silent parts with a threshold and 0.01 sec margin""" + return trim_silence( + wav=wav, + sample_rate=self.sample_rate, + trim_db=self.trim_db, + win_length=self.win_length, + hop_length=self.hop_length, + ) + + @staticmethod + def sound_norm(x: np.ndarray) -> np.ndarray: + """Normalize the volume of an audio signal. + + Args: + x (np.ndarray): Raw waveform. + + Returns: + np.ndarray: Volume normalized waveform. + """ + return volume_norm(x=x) + + def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray: + """Normalize the volume based on RMS of the signal. + + Args: + x (np.ndarray): Raw waveform. + + Returns: + np.ndarray: RMS normalized waveform. + """ + if db_level is None: + db_level = self.db_level + return rms_volume_norm(x=x, db_level=db_level) + + ### save and load ### + def load_wav(self, filename: str, sr: int = None) -> np.ndarray: + """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + + Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. + + Args: + filename (str): Path to the wav file. + sr (int, optional): Sampling rate for resampling. Defaults to None. + + Returns: + np.ndarray: Loaded waveform. + """ + if sr is not None: + x = load_wav(filename=filename, sample_rate=sr, resample=True) + else: + x = load_wav(filename=filename, sample_rate=self.sample_rate, resample=self.resample) + if self.do_trim_silence: + try: + x = self.trim_silence(x) + except ValueError: + print(f" [!] File cannot be trimmed for silence - {filename}") + if self.do_sound_norm: + x = self.sound_norm(x) + if self.do_rms_norm: + x = self.rms_volume_norm(x, self.db_level) + return x + + def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out=None) -> None: + """Save a waveform to a file using Scipy. + + Args: + wav (np.ndarray): Waveform to save. + path (str): Path to a output file. + sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. 
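A second sketch covering pitch, end-pointing, and file I/O on the same class; the paths are placeholders and the constructor values are again only illustrative, as in the sketch above.

from TTS.utils.audio.processor import AudioProcessor

ap = AudioProcessor(sample_rate=22050, num_mels=80, fft_size=1024, win_length=1024,
                    hop_length=256, min_level_db=-100, pitch_fmin=65, pitch_fmax=600,
                    trim_db=45, do_rms_norm=True, db_level=-27.0)
wav = ap.load_wav("speech.wav")     # placeholder path; normalized per the flags above
f0 = ap.compute_f0(wav)             # frame-aligned pitch track
end = ap.find_endpoint(wav)         # sample index of the last non-silent point
ap.save_wav(wav, "out.wav")         # RMS-normalized and written as 16-bit PCM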
+ """ + if self.do_rms_norm: + wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767 + else: + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + + wav_norm = wav_norm.astype(np.int16) + if pipe_out: + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm) + wav_buffer.seek(0) + pipe_out.buffer.write(wav_buffer.read()) + scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm) + + def get_duration(self, filename: str) -> float: + """Get the duration of a wav file using Librosa. + + Args: + filename (str): Path to the wav file. + """ + return librosa.get_duration(filename=filename) diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py new file mode 100644 index 0000000..fd40ebb --- /dev/null +++ b/TTS/utils/audio/torch_transforms.py @@ -0,0 +1,165 @@ +import librosa +import torch +from torch import nn + + +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """Some of the audio processing funtions using Torch for faster batch processing. + + Args: + + n_fft (int): + FFT window size for STFT. + + hop_length (int): + number of frames between STFT columns. + + win_length (int, optional): + STFT window length. + + pad_wav (bool, optional): + If True pad the audio with (n_fft - hop_length) / 2). Defaults to False. + + window (str, optional): + The name of a function to create a window tensor that is applied/multiplied to each frame/window. Defaults to "hann_window" + + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms. Defaults to None. + + n_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. + + use_mel (bool, optional): + If True compute the melspectrograms otherwise. Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to False. + + spec_gain (float, optional): + gain applied when converting amplitude to DB. Defaults to 1.0. + + power (float, optional): + Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Defaults to None. + + use_htk (bool, optional): + Use HTK formula in mel filter instead of Slaney. + + mel_norm (None, 'slaney', or number, optional): + If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). + + If numeric, use `librosa.util.normalize` to normalize each filter by to unit l_p norm. + See `librosa.util.normalize` for a full description of supported norm values + (including `+-np.inf`). + + Otherwise, leave all the triangles aiming for a peak value of 1.0. Defaults to "slaney". 
+ """ + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + do_amp_to_db=False, + spec_gain=1.0, + power=None, + use_htk=False, + mel_norm="slaney", + normalized=False, + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.do_amp_to_db = do_amp_to_db + self.spec_gain = spec_gain + self.power = power + self.use_htk = use_htk + self.mel_norm = mel_norm + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + self.normalized = normalized + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. + + Shapes: + x: [B x T] or [:math:`[B, 1, T]`] + """ + if x.ndim == 2: + x = x.unsqueeze(1) + if self.pad_wav: + padding = int((self.n_fft - self.hop_length) / 2) + x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") + # B x D x T x 2 + o = torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=self.normalized, + onesided=True, + return_complex=False, + ) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + S = torch.sqrt(torch.clamp(M**2 + P**2, min=1e-8)) + + if self.power is not None: + S = S**self.power + + if self.use_mel: + S = torch.matmul(self.mel_basis.to(x), S) + if self.do_amp_to_db: + S = self._amp_to_db(S, spec_gain=self.spec_gain) + return S + + def _build_mel_basis(self): + mel_basis = librosa.filters.mel( + sr=self.sample_rate, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.mel_fmin, + fmax=self.mel_fmax, + htk=self.use_htk, + norm=self.mel_norm, + ) + self.mel_basis = torch.from_numpy(mel_basis).float() + + @staticmethod + def _amp_to_db(x, spec_gain=1.0): + return torch.log(torch.clamp(x, min=1e-5) * spec_gain) + + @staticmethod + def _db_to_amp(x, spec_gain=1.0): + return torch.exp(x) / spec_gain diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py new file mode 100644 index 0000000..511d215 --- /dev/null +++ b/TTS/utils/callbacks.py @@ -0,0 +1,105 @@ +class TrainerCallback: + @staticmethod + def on_init_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_init_start"): + trainer.model.module.on_init_start(trainer) + else: + if hasattr(trainer.model, "on_init_start"): + trainer.model.on_init_start(trainer) + + if hasattr(trainer.criterion, "on_init_start"): + trainer.criterion.on_init_start(trainer) + + if hasattr(trainer.optimizer, "on_init_start"): + trainer.optimizer.on_init_start(trainer) + + @staticmethod + def on_init_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_init_end"): + trainer.model.module.on_init_end(trainer) + else: + if hasattr(trainer.model, "on_init_end"): + trainer.model.on_init_end(trainer) + + if hasattr(trainer.criterion, "on_init_end"): + trainer.criterion.on_init_end(trainer) + + if hasattr(trainer.optimizer, "on_init_end"): + trainer.optimizer.on_init_end(trainer) + + @staticmethod + def on_epoch_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if 
hasattr(trainer.model.module, "on_epoch_start"): + trainer.model.module.on_epoch_start(trainer) + else: + if hasattr(trainer.model, "on_epoch_start"): + trainer.model.on_epoch_start(trainer) + + if hasattr(trainer.criterion, "on_epoch_start"): + trainer.criterion.on_epoch_start(trainer) + + if hasattr(trainer.optimizer, "on_epoch_start"): + trainer.optimizer.on_epoch_start(trainer) + + @staticmethod + def on_epoch_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_epoch_end"): + trainer.model.module.on_epoch_end(trainer) + else: + if hasattr(trainer.model, "on_epoch_end"): + trainer.model.on_epoch_end(trainer) + + if hasattr(trainer.criterion, "on_epoch_end"): + trainer.criterion.on_epoch_end(trainer) + + if hasattr(trainer.optimizer, "on_epoch_end"): + trainer.optimizer.on_epoch_end(trainer) + + @staticmethod + def on_train_step_start(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_train_step_start"): + trainer.model.module.on_train_step_start(trainer) + else: + if hasattr(trainer.model, "on_train_step_start"): + trainer.model.on_train_step_start(trainer) + + if hasattr(trainer.criterion, "on_train_step_start"): + trainer.criterion.on_train_step_start(trainer) + + if hasattr(trainer.optimizer, "on_train_step_start"): + trainer.optimizer.on_train_step_start(trainer) + + @staticmethod + def on_train_step_end(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_train_step_end"): + trainer.model.module.on_train_step_end(trainer) + else: + if hasattr(trainer.model, "on_train_step_end"): + trainer.model.on_train_step_end(trainer) + + if hasattr(trainer.criterion, "on_train_step_end"): + trainer.criterion.on_train_step_end(trainer) + + if hasattr(trainer.optimizer, "on_train_step_end"): + trainer.optimizer.on_train_step_end(trainer) + + @staticmethod + def on_keyboard_interrupt(trainer) -> None: + if hasattr(trainer.model, "module"): + if hasattr(trainer.model.module, "on_keyboard_interrupt"): + trainer.model.module.on_keyboard_interrupt(trainer) + else: + if hasattr(trainer.model, "on_keyboard_interrupt"): + trainer.model.on_keyboard_interrupt(trainer) + + if hasattr(trainer.criterion, "on_keyboard_interrupt"): + trainer.criterion.on_keyboard_interrupt(trainer) + + if hasattr(trainer.optimizer, "on_keyboard_interrupt"): + trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py new file mode 100644 index 0000000..7206ffd --- /dev/null +++ b/TTS/utils/capacitron_optimizer.py @@ -0,0 +1,67 @@ +from typing import Generator + +from trainer.trainer_utils import get_optimizer + + +class CapacitronOptimizer: + """Double optimizer class for the Capacitron model.""" + + def __init__(self, config: dict, model_params: Generator) -> None: + self.primary_params, self.secondary_params = self.split_model_parameters(model_params) + + optimizer_names = list(config.optimizer_params.keys()) + optimizer_parameters = list(config.optimizer_params.values()) + + self.primary_optimizer = get_optimizer( + optimizer_names[0], + optimizer_parameters[0], + config.lr, + parameters=self.primary_params, + ) + + self.secondary_optimizer = get_optimizer( + optimizer_names[1], + self.extract_optimizer_parameters(optimizer_parameters[1]), + optimizer_parameters[1]["lr"], + parameters=self.secondary_params, + ) + + self.param_groups = self.primary_optimizer.param_groups + + def first_step(self): + 
self.secondary_optimizer.step() + self.secondary_optimizer.zero_grad() + self.primary_optimizer.zero_grad() + + def step(self): + # Update param groups to display the correct learning rate + self.param_groups = self.primary_optimizer.param_groups + self.primary_optimizer.step() + + def zero_grad(self, set_to_none=False): + self.primary_optimizer.zero_grad(set_to_none) + self.secondary_optimizer.zero_grad(set_to_none) + + def load_state_dict(self, state_dict): + self.primary_optimizer.load_state_dict(state_dict[0]) + self.secondary_optimizer.load_state_dict(state_dict[1]) + + def state_dict(self): + return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()] + + @staticmethod + def split_model_parameters(model_params: Generator) -> list: + primary_params = [] + secondary_params = [] + for name, param in model_params: + if param.requires_grad: + if name == "capacitron_vae_layer.beta": + secondary_params.append(param) + else: + primary_params.append(param) + return [iter(primary_params), iter(secondary_params)] + + @staticmethod + def extract_optimizer_parameters(params: dict) -> dict: + """Extract parameters that are not the learning rate""" + return {k: v for k, v in params.items() if k != "lr"} diff --git a/TTS/utils/distribute.py b/TTS/utils/distribute.py new file mode 100644 index 0000000..a51ef76 --- /dev/null +++ b/TTS/utils/distribute.py @@ -0,0 +1,20 @@ +# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py +import torch +import torch.distributed as dist + + +def reduce_tensor(tensor, num_gpus): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.reduce_op.SUM) + rt /= num_gpus + return rt + + +def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url): + assert torch.cuda.is_available(), "Distributed mode requires CUDA." + + # Set cuda device so everything is done on the right GPU. + torch.cuda.set_device(rank % torch.cuda.device_count()) + + # Initialize distributed communication + dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name) diff --git a/TTS/utils/download.py b/TTS/utils/download.py new file mode 100644 index 0000000..3f06b57 --- /dev/null +++ b/TTS/utils/download.py @@ -0,0 +1,206 @@ +# Adapted from https://github.com/pytorch/audio/ + +import hashlib +import logging +import os +import tarfile +import urllib +import urllib.request +import zipfile +from os.path import expanduser +from typing import Any, Iterable, List, Optional + +from torch.utils.model_zoo import tqdm + + +def stream_url( + url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True +) -> Iterable: + """Stream url by chunk + + Args: + url (str): Url. + start_byte (int or None, optional): Start streaming at that point (Default: ``None``). + block_size (int, optional): Size of chunks to stream (Default: ``32 * 1024``). + progress_bar (bool, optional): Display a progress bar (Default: ``True``). 
+ """ + + # If we already have the whole file, there is no need to download it again + req = urllib.request.Request(url, method="HEAD") + with urllib.request.urlopen(req) as response: + url_size = int(response.info().get("Content-Length", -1)) + if url_size == start_byte: + return + + req = urllib.request.Request(url) + if start_byte: + req.headers["Range"] = "bytes={}-".format(start_byte) + + with urllib.request.urlopen(req) as upointer, tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=url_size, + disable=not progress_bar, + ) as pbar: + num_bytes = 0 + while True: + chunk = upointer.read(block_size) + if not chunk: + break + yield chunk + num_bytes += len(chunk) + pbar.update(len(chunk)) + + +def download_url( + url: str, + download_folder: str, + filename: Optional[str] = None, + hash_value: Optional[str] = None, + hash_type: str = "sha256", + progress_bar: bool = True, + resume: bool = False, +) -> None: + """Download file to disk. + + Args: + url (str): Url. + download_folder (str): Folder to download file. + filename (str or None, optional): Name of downloaded file. If None, it is inferred from the url + (Default: ``None``). + hash_value (str or None, optional): Hash for url (Default: ``None``). + hash_type (str, optional): Hash type, among "sha256" and "md5" (Default: ``"sha256"``). + progress_bar (bool, optional): Display a progress bar (Default: ``True``). + resume (bool, optional): Enable resuming download (Default: ``False``). + """ + + req = urllib.request.Request(url, method="HEAD") + req_info = urllib.request.urlopen(req).info() # pylint: disable=consider-using-with + + # Detect filename + filename = filename or req_info.get_filename() or os.path.basename(url) + filepath = os.path.join(download_folder, filename) + if resume and os.path.exists(filepath): + mode = "ab" + local_size: Optional[int] = os.path.getsize(filepath) + + elif not resume and os.path.exists(filepath): + raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + else: + mode = "wb" + local_size = None + + if hash_value and local_size == int(req_info.get("Content-Length", -1)): + with open(filepath, "rb") as file_obj: + if validate_file(file_obj, hash_value, hash_type): + return + raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + + with open(filepath, mode) as fpointer: + for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): + fpointer.write(chunk) + + with open(filepath, "rb") as file_obj: + if hash_value and not validate_file(file_obj, hash_value, hash_type): + raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + + +def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: + """Validate a given file object with its hash. + + Args: + file_obj: File object to read from. + hash_value (str): Hash for url. + hash_type (str, optional): Hash type, among "sha256" and "md5" (Default: ``"sha256"``). + + Returns: + bool: return True if its a valid file, else False. 
+ """ + + if hash_type == "sha256": + hash_func = hashlib.sha256() + elif hash_type == "md5": + hash_func = hashlib.md5() + else: + raise ValueError + + while True: + # Read by chunk to avoid filling memory + chunk = file_obj.read(1024**2) + if not chunk: + break + hash_func.update(chunk) + + return hash_func.hexdigest() == hash_value + + +def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: + """Extract archive. + Args: + from_path (str): the path of the archive. + to_path (str or None, optional): the root path of the extraced files (directory of from_path) + (Default: ``None``) + overwrite (bool, optional): overwrite existing files (Default: ``False``) + + Returns: + list: List of paths to extracted files even if not overwritten. + """ + + if to_path is None: + to_path = os.path.dirname(from_path) + + try: + with tarfile.open(from_path, "r") as tar: + logging.info("Opened tar file %s.", from_path) + files = [] + for file_ in tar: # type: Any + file_path = os.path.join(to_path, file_.name) + if file_.isfile(): + files.append(file_path) + if os.path.exists(file_path): + logging.info("%s already extracted.", file_path) + if not overwrite: + continue + tar.extract(file_, to_path) + return files + except tarfile.ReadError: + pass + + try: + with zipfile.ZipFile(from_path, "r") as zfile: + logging.info("Opened zip file %s.", from_path) + files = zfile.namelist() + for file_ in files: + file_path = os.path.join(to_path, file_) + if os.path.exists(file_path): + logging.info("%s already extracted.", file_path) + if not overwrite: + continue + zfile.extract(file_, to_path) + return files + except zipfile.BadZipFile: + pass + + raise NotImplementedError(" > [!] only supports tar.gz, tgz, and zip achives.") + + +def download_kaggle_dataset(dataset_path: str, dataset_name: str, output_path: str): + """Download dataset from kaggle. + Args: + dataset_path (str): + This the kaggle link to the dataset. for example vctk is 'mfekadu/english-multispeaker-corpus-for-voice-cloning' + dataset_name (str): Name of the folder the dataset will be saved in. + output_path (str): Path of the location you want the dataset folder to be saved to. + """ + data_path = os.path.join(output_path, dataset_name) + try: + import kaggle # pylint: disable=import-outside-toplevel + + kaggle.api.authenticate() + print(f"""\nDownloading {dataset_name}...""") + kaggle.api.dataset_download_files(dataset_path, path=data_path, unzip=True) + except OSError: + print( + f"""[!] in order to download kaggle datasets, you need to have a kaggle api token stored in your {os.path.join(expanduser('~'), '.kaggle/kaggle.json')}""" + ) diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py new file mode 100644 index 0000000..104dc7b --- /dev/null +++ b/TTS/utils/downloaders.py @@ -0,0 +1,126 @@ +import os +from typing import Optional + +from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive + + +def download_ljspeech(path: str): + """Download and extract LJSpeech dataset + + Args: + path (str): path to the directory where the dataset will be stored. + """ + os.makedirs(path, exist_ok=True) + url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2" + download_url(url, path) + basename = os.path.basename(url) + archive = os.path.join(path, basename) + print(" > Extracting archive file...") + extract_archive(archive) + + +def download_vctk(path: str, use_kaggle: Optional[bool] = False): + """Download and extract VCTK dataset. 
+ + Args: + path (str): path to the directory where the dataset will be stored. + + use_kaggle (bool, optional): Downloads vctk dataset from kaggle. Is generally faster. Defaults to False. + """ + if use_kaggle: + download_kaggle_dataset("mfekadu/english-multispeaker-corpus-for-voice-cloning", "VCTK", path) + else: + os.makedirs(path, exist_ok=True) + url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip" + download_url(url, path) + basename = os.path.basename(url) + archive = os.path.join(path, basename) + print(" > Extracting archive file...") + extract_archive(archive) + + +def download_tweb(path: str): + """Download and extract Tweb dataset + + Args: + path (str): Path to the directory where the dataset will be stored. + """ + download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path) + + +def download_libri_tts(path: str, subset: Optional[str] = "all"): + """Download and extract libri tts dataset. + + Args: + path (str): Path to the directory where the dataset will be stored. + + subset (str, optional): Name of the subset to download. If you only want to download a certain + portion specify it here. Defaults to 'all'. + """ + + subset_dict = { + "libri-tts-clean-100": "http://www.openslr.org/resources/60/train-clean-100.tar.gz", + "libri-tts-clean-360": "http://www.openslr.org/resources/60/train-clean-360.tar.gz", + "libri-tts-other-500": "http://www.openslr.org/resources/60/train-other-500.tar.gz", + "libri-tts-dev-clean": "http://www.openslr.org/resources/60/dev-clean.tar.gz", + "libri-tts-dev-other": "http://www.openslr.org/resources/60/dev-other.tar.gz", + "libri-tts-test-clean": "http://www.openslr.org/resources/60/test-clean.tar.gz", + "libri-tts-test-other": "http://www.openslr.org/resources/60/test-other.tar.gz", + } + + os.makedirs(path, exist_ok=True) + if subset == "all": + for sub, val in subset_dict.items(): + print(f" > Downloading {sub}...") + download_url(val, path) + basename = os.path.basename(val) + archive = os.path.join(path, basename) + print(" > Extracting archive file...") + extract_archive(archive) + print(" > All subsets downloaded") + else: + url = subset_dict[subset] + download_url(url, path) + basename = os.path.basename(url) + archive = os.path.join(path, basename) + print(" > Extracting archive file...") + extract_archive(archive) + + +def download_thorsten_de(path: str): + """Download and extract Thorsten german male voice dataset. + + Args: + path (str): Path to the directory where the dataset will be stored. + """ + os.makedirs(path, exist_ok=True) + url = "https://www.openslr.org/resources/95/thorsten-de_v02.tgz" + download_url(url, path) + basename = os.path.basename(url) + archive = os.path.join(path, basename) + print(" > Extracting archive file...") + extract_archive(archive) + + +def download_mailabs(path: str, language: str = "english"): + """Download and extract Mailabs dataset. + + Args: + path (str): Path to the directory where the dataset will be stored. + + language (str): Language subset to download. Defaults to english. 
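An illustrative call of the dataset downloaders in this file; the destination directories are placeholders.

from TTS.utils.downloaders import download_libri_tts, download_ljspeech

download_ljspeech("/data/LJSpeech")                                  # placeholder target directory
download_libri_tts("/data/LibriTTS", subset="libri-tts-clean-100")   # or subset="all" for every split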
+ """ + language_dict = { + "english": "https://data.solak.de/data/Training/stt_tts/en_US.tgz", + "german": "https://data.solak.de/data/Training/stt_tts/de_DE.tgz", + "french": "https://data.solak.de/data/Training/stt_tts/fr_FR.tgz", + "italian": "https://data.solak.de/data/Training/stt_tts/it_IT.tgz", + "spanish": "https://data.solak.de/data/Training/stt_tts/es_ES.tgz", + } + os.makedirs(path, exist_ok=True) + url = language_dict[language] + download_url(url, path) + basename = os.path.basename(url) + archive = os.path.join(path, basename) + print(" > Extracting archive file...") + extract_archive(archive) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py new file mode 100644 index 0000000..4fa4741 --- /dev/null +++ b/TTS/utils/generic_utils.py @@ -0,0 +1,239 @@ +# -*- coding: utf-8 -*- +import datetime +import importlib +import logging +import os +import re +import subprocess +import sys +from pathlib import Path +from typing import Dict + +import fsspec +import torch + + +def to_cuda(x: torch.Tensor) -> torch.Tensor: + if x is None: + return None + if torch.is_tensor(x): + x = x.contiguous() + if torch.cuda.is_available(): + x = x.cuda(non_blocking=True) + return x + + +def get_cuda(): + use_cuda = torch.cuda.is_available() + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + return use_cuda, device + + +def get_git_branch(): + try: + out = subprocess.check_output(["git", "branch"]).decode("utf8") + current = next(line for line in out.split("\n") if line.startswith("*")) + current.replace("* ", "") + except subprocess.CalledProcessError: + current = "inside_docker" + except (FileNotFoundError, StopIteration) as e: + current = "unknown" + return current + + +def get_commit_hash(): + """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script""" + # try: + # subprocess.check_output(['git', 'diff-index', '--quiet', + # 'HEAD']) # Verify client is clean + # except: + # raise RuntimeError( + # " !! Commit before training to get the commit hash.") + try: + commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode().strip() + # Not copying .git folder into docker container + except (subprocess.CalledProcessError, FileNotFoundError): + commit = "0000000" + return commit + + +def get_experiment_folder_path(root_path, model_name): + """Get an experiment folder path with the current date and time""" + date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") + commit_hash = get_commit_hash() + output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash) + return output_folder + + +def remove_experiment_folder(experiment_path): + """Check folder if there is a checkpoint, otherwise remove the folder""" + fs = fsspec.get_mapper(experiment_path).fs + checkpoint_files = fs.glob(experiment_path + "/*.pth") + if not checkpoint_files: + if fs.exists(experiment_path): + fs.rm(experiment_path, recursive=True) + print(" ! Run is removed from {}".format(experiment_path)) + else: + print(" ! 
Run is kept in {}".format(experiment_path)) + + +def count_parameters(model): + r"""Count number of trainable parameters in a network""" + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def to_camel(text): + text = text.capitalize() + text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + text = text.replace("Tts", "TTS") + text = text.replace("vc", "VC") + return text + + +def find_module(module_path: str, module_name: str) -> object: + module_name = module_name.lower() + module = importlib.import_module(module_path + "." + module_name) + class_name = to_camel(module_name) + return getattr(module, class_name) + + +def import_class(module_path: str) -> object: + """Import a class from a module path. + + Args: + module_path (str): The module path of the class. + + Returns: + object: The imported class. + """ + class_name = module_path.split(".")[-1] + module_path = ".".join(module_path.split(".")[:-1]) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def get_import_path(obj: object) -> str: + """Get the import path of a class. + + Args: + obj (object): The class object. + + Returns: + str: The import path of the class. + """ + return ".".join([type(obj).__module__, type(obj).__name__]) + + +def get_user_data_dir(appname): + TTS_HOME = os.environ.get("TTS_HOME") + XDG_DATA_HOME = os.environ.get("XDG_DATA_HOME") + if TTS_HOME is not None: + ans = Path(TTS_HOME).expanduser().resolve(strict=False) + elif XDG_DATA_HOME is not None: + ans = Path(XDG_DATA_HOME).expanduser().resolve(strict=False) + elif sys.platform == "win32": + import winreg # pylint: disable=import-outside-toplevel + + key = winreg.OpenKey( + winreg.HKEY_CURRENT_USER, r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders" + ) + dir_, _ = winreg.QueryValueEx(key, "Local AppData") + ans = Path(dir_).resolve(strict=False) + elif sys.platform == "darwin": + ans = Path("~/Library/Application Support/").expanduser() + else: + ans = Path.home().joinpath(".local/share") + return ans.joinpath(appname) + + +def set_init_dict(model_dict, checkpoint_state, c): + # Partial initialization: if there is a mismatch with new and old layer, it is skipped. + for k, v in checkpoint_state.items(): + if k not in model_dict: + print(" | > Layer missing in the model definition: {}".format(k)) + # 1. filter out unnecessary keys + pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} + # 2. filter out different size layers + pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()} + # 3. skip reinit layers + if c.has("reinit_layers") and c.reinit_layers is not None: + for reinit_layer_name in c.reinit_layers: + pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} + # 4. overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + print(" | > {} / {} layers are restored.".format(len(pretrained_dict), len(model_dict))) + return model_dict + + +def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: + """Format kwargs to hande auxilary inputs to models. + + Args: + def_args (Dict): A dictionary of argument names and their default values if not defined in `kwargs`. + kwargs (Dict): A `dict` or `kwargs` that includes auxilary inputs to the model. + + Returns: + Dict: arguments with formatted auxilary inputs. 
+ """ + kwargs = kwargs.copy() + for name in def_args: + if name not in kwargs or kwargs[name] is None: + kwargs[name] = def_args[name] + return kwargs + + +class KeepAverage: + def __init__(self): + self.avg_values = {} + self.iters = {} + + def __getitem__(self, key): + return self.avg_values[key] + + def items(self): + return self.avg_values.items() + + def add_value(self, name, init_val=0, init_iter=0): + self.avg_values[name] = init_val + self.iters[name] = init_iter + + def update_value(self, name, value, weighted_avg=False): + if name not in self.avg_values: + # add value if not exist before + self.add_value(name, init_val=value) + else: + # else update existing value + if weighted_avg: + self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value + self.iters[name] += 1 + else: + self.avg_values[name] = self.avg_values[name] * self.iters[name] + value + self.iters[name] += 1 + self.avg_values[name] /= self.iters[name] + + def add_values(self, name_dict): + for key, value in name_dict.items(): + self.add_value(key, init_val=value) + + def update_values(self, value_dict): + for key, value in value_dict.items(): + self.update_value(key, value) + + +def get_timestamp(): + return datetime.now().strftime("%y%m%d-%H%M%S") + + +def setup_logger(logger_name, root, phase, level=logging.INFO, screen=False, tofile=False): + lg = logging.getLogger(logger_name) + formatter = logging.Formatter("%(asctime)s.%(msecs)03d - %(levelname)s: %(message)s", datefmt="%y-%m-%d %H:%M:%S") + lg.setLevel(level) + if tofile: + log_file = os.path.join(root, phase + "_{}.log".format(get_timestamp())) + fh = logging.FileHandler(log_file, mode="w") + fh.setFormatter(formatter) + lg.addHandler(fh) + if screen: + sh = logging.StreamHandler() + sh.setFormatter(formatter) + lg.addHandler(sh) diff --git a/TTS/utils/io.py b/TTS/utils/io.py new file mode 100644 index 0000000..3107ba6 --- /dev/null +++ b/TTS/utils/io.py @@ -0,0 +1,70 @@ +import os +import pickle as pickle_tts +from typing import Any, Callable, Dict, Union + +import fsspec +import torch + +from TTS.utils.generic_utils import get_user_data_dir + + +class RenamingUnpickler(pickle_tts.Unpickler): + """Overload default pickler to solve module renaming problem""" + + def find_class(self, module, name): + return super().find_class(module.replace("mozilla_voice_tts", "TTS"), name) + + +class AttrDict(dict): + """A custom dict which converts dict keys + to class attributes""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__dict__ = self + + +def load_fsspec( + path: str, + map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, + cache: bool = True, + **kwargs, +) -> Any: + """Like torch.load but can load from other locations (e.g. s3:// , gs://). + + Args: + path: Any path or url supported by fsspec. + map_location: torch.device or str. + cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True. + **kwargs: Keyword arguments forwarded to torch.load. + + Returns: + Object stored in path. 
+ """ + is_local = os.path.isdir(path) or os.path.isfile(path) + if cache and not is_local: + with fsspec.open( + f"filecache::{path}", + filecache={"cache_storage": str(get_user_data_dir("tts_cache"))}, + mode="rb", + ) as f: + return torch.load(f, map_location=map_location, **kwargs) + else: + with fsspec.open(path, "rb") as f: + return torch.load(f, map_location=map_location, **kwargs) + + +def load_checkpoint( + model, checkpoint_path, use_cuda=False, eval=False, cache=False +): # pylint: disable=redefined-builtin + try: + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) + except ModuleNotFoundError: + pickle_tts.Unpickler = RenamingUnpickler + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache) + model.load_state_dict(state["model"]) + if use_cuda: + model.cuda() + if eval: + model.eval() + return model, state diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py new file mode 100644 index 0000000..3a527f4 --- /dev/null +++ b/TTS/utils/manage.py @@ -0,0 +1,621 @@ +import json +import os +import re +import tarfile +import zipfile +from pathlib import Path +from shutil import copyfile, rmtree +from typing import Dict, List, Tuple + +import fsspec +import requests +from tqdm import tqdm + +from TTS.config import load_config, read_json_with_comments +from TTS.utils.generic_utils import get_user_data_dir + +LICENSE_URLS = { + "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", + "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl2": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl 2.0": "https://www.mozilla.org/en-US/MPL/2.0/", + "mit": "https://choosealicense.com/licenses/mit/", + "apache 2.0": "https://choosealicense.com/licenses/apache-2.0/", + "apache2": "https://choosealicense.com/licenses/apache-2.0/", + "cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", + "cpml": "https://coqui.ai/cpml.txt", +} + + +class ModelManager(object): + tqdm_progress = None + """Manage TTS models defined in .models.json. + It provides an interface to list and download + models defines in '.model.json' + + Models are downloaded under '.TTS' folder in the user's + home path. + + Args: + models_file (str): path to .model.json file. Defaults to None. + output_prefix (str): prefix to `tts` to download models. Defaults to None + progress_bar (bool): print a progress bar when donwloading a file. Defaults to False. + verbose (bool): print info. Defaults to True. + """ + + def __init__(self, models_file=None, output_prefix=None, progress_bar=False, verbose=True): + super().__init__() + self.progress_bar = progress_bar + self.verbose = verbose + if output_prefix is None: + self.output_prefix = get_user_data_dir("tts") + else: + self.output_prefix = os.path.join(output_prefix, "tts") + self.models_dict = None + if models_file is not None: + self.read_models_file(models_file) + else: + # try the default location + path = Path(__file__).parent / "../.models.json" + self.read_models_file(path) + + def read_models_file(self, file_path): + """Read .models.json as a dict + + Args: + file_path (str): path to .models.json. 
+ """ + self.models_dict = read_json_with_comments(file_path) + + def _list_models(self, model_type, model_count=0): + if self.verbose: + print("\n Name format: type/language/dataset/model") + model_list = [] + for lang in self.models_dict[model_type]: + for dataset in self.models_dict[model_type][lang]: + for model in self.models_dict[model_type][lang][dataset]: + model_full_name = f"{model_type}--{lang}--{dataset}--{model}" + output_path = os.path.join(self.output_prefix, model_full_name) + if self.verbose: + if os.path.exists(output_path): + print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]") + else: + print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}") + model_list.append(f"{model_type}/{lang}/{dataset}/{model}") + model_count += 1 + return model_list + + def _list_for_model_type(self, model_type): + models_name_list = [] + model_count = 1 + models_name_list.extend(self._list_models(model_type, model_count)) + return models_name_list + + def list_models(self): + models_name_list = [] + model_count = 1 + for model_type in self.models_dict: + model_list = self._list_models(model_type, model_count) + models_name_list.extend(model_list) + return models_name_list + + def model_info_by_idx(self, model_query): + """Print the description of the model from .models.json file using model_idx + + Args: + model_query (str): / + """ + model_name_list = [] + model_type, model_query_idx = model_query.split("/") + try: + model_query_idx = int(model_query_idx) + if model_query_idx <= 0: + print("> model_query_idx should be a positive integer!") + return + except: + print("> model_query_idx should be an integer!") + return + model_count = 0 + if model_type in self.models_dict: + for lang in self.models_dict[model_type]: + for dataset in self.models_dict[model_type][lang]: + for model in self.models_dict[model_type][lang][dataset]: + model_name_list.append(f"{model_type}/{lang}/{dataset}/{model}") + model_count += 1 + else: + print(f"> model_type {model_type} does not exist in the list.") + return + if model_query_idx > model_count: + print(f"model query idx exceeds the number of available models [{model_count}] ") + else: + model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/") + print(f"> model type : {model_type}") + print(f"> language supported : {lang}") + print(f"> dataset used : {dataset}") + print(f"> model name : {model}") + if "description" in self.models_dict[model_type][lang][dataset][model]: + print(f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}") + else: + print("> description : coming soon") + if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]: + print(f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}") + + def model_info_by_full_name(self, model_query_name): + """Print the description of the model from .models.json file using model_full_name + + Args: + model_query_name (str): Format is /// + """ + model_type, lang, dataset, model = model_query_name.split("/") + if model_type in self.models_dict: + if lang in self.models_dict[model_type]: + if dataset in self.models_dict[model_type][lang]: + if model in self.models_dict[model_type][lang][dataset]: + print(f"> model type : {model_type}") + print(f"> language supported : {lang}") + print(f"> dataset used : {dataset}") + print(f"> model name : {model}") + if "description" in self.models_dict[model_type][lang][dataset][model]: + print( + f"> description : 
{self.models_dict[model_type][lang][dataset][model]['description']}" + ) + else: + print("> description : coming soon") + if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]: + print( + f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}" + ) + else: + print(f"> model {model} does not exist for {model_type}/{lang}/{dataset}.") + else: + print(f"> dataset {dataset} does not exist for {model_type}/{lang}.") + else: + print(f"> lang {lang} does not exist for {model_type}.") + else: + print(f"> model_type {model_type} does not exist in the list.") + + def list_tts_models(self): + """Print all `TTS` models and return a list of model names + + Format is `language/dataset/model` + """ + return self._list_for_model_type("tts_models") + + def list_vocoder_models(self): + """Print all the `vocoder` models and return a list of model names + + Format is `language/dataset/model` + """ + return self._list_for_model_type("vocoder_models") + + def list_vc_models(self): + """Print all the voice conversion models and return a list of model names + + Format is `language/dataset/model` + """ + return self._list_for_model_type("voice_conversion_models") + + def list_langs(self): + """Print all the available languages""" + print(" Name format: type/language") + for model_type in self.models_dict: + for lang in self.models_dict[model_type]: + print(f" >: {model_type}/{lang} ") + + def list_datasets(self): + """Print all the datasets""" + print(" Name format: type/language/dataset") + for model_type in self.models_dict: + for lang in self.models_dict[model_type]: + for dataset in self.models_dict[model_type][lang]: + print(f" >: {model_type}/{lang}/{dataset}") + + @staticmethod + def print_model_license(model_item: Dict): + """Print the license of a model + + Args: + model_item (dict): model item in the models.json + """ + if "license" in model_item and model_item["license"].strip() != "": + print(f" > Model's license - {model_item['license']}") + if model_item["license"].lower() in LICENSE_URLS: + print(f" > Check {LICENSE_URLS[model_item['license'].lower()]} for more info.") + else: + print(" > Check https://opensource.org/licenses for more info.") + else: + print(" > Model's license - No license information available") + + def _download_github_model(self, model_item: Dict, output_path: str): + if isinstance(model_item["github_rls_url"], list): + self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar) + else: + self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar) + + def _download_hf_model(self, model_item: Dict, output_path: str): + if isinstance(model_item["hf_url"], list): + self._download_model_files(model_item["hf_url"], output_path, self.progress_bar) + else: + self._download_zip_file(model_item["hf_url"], output_path, self.progress_bar) + + def download_fairseq_model(self, model_name, output_path): + URI_PREFIX = "https://coqui.gateway.scarf.sh/fairseq/" + _, lang, _, _ = model_name.split("/") + model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz") + self._download_tar_file(model_download_uri, output_path, self.progress_bar) + + @staticmethod + def set_model_url(model_item: Dict): + model_item["model_url"] = None + if "github_rls_url" in model_item: + model_item["model_url"] = model_item["github_rls_url"] + elif "hf_url" in model_item: + model_item["model_url"] = model_item["hf_url"] + elif "fairseq" in model_item["model_name"]: + model_item["model_url"] = 
"https://coqui.gateway.scarf.sh/fairseq/" + elif "xtts" in model_item["model_name"]: + model_item["model_url"] = "https://coqui.gateway.scarf.sh/xtts/" + return model_item + + def _set_model_item(self, model_name): + # fetch model info from the dict + if "fairseq" in model_name: + model_type = "tts_models" + lang = model_name.split("/")[1] + model_item = { + "model_type": "tts_models", + "license": "CC BY-NC 4.0", + "default_vocoder": None, + "author": "fairseq", + "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.", + } + model_item["model_name"] = model_name + elif "xtts" in model_name and len(model_name.split("/")) != 4: + # loading xtts models with only model name (e.g. xtts_v2.0.2) + # check model name has the version number with regex + version_regex = r"v\d+\.\d+\.\d+" + if re.search(version_regex, model_name): + model_version = model_name.split("_")[-1] + else: + model_version = "main" + model_type = "tts_models" + lang = "multilingual" + dataset = "multi-dataset" + model = model_name + model_item = { + "default_vocoder": None, + "license": "CPML", + "contact": "info@coqui.ai", + "tos_required": True, + "hf_url": [ + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/model.pth", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/speakers_xtts.pth", + ], + } + else: + # get model from models.json + model_type, lang, dataset, model = model_name.split("/") + model_item = self.models_dict[model_type][lang][dataset][model] + model_item["model_type"] = model_type + + model_full_name = f"{model_type}--{lang}--{dataset}--{model}" + md5hash = model_item["model_hash"] if "model_hash" in model_item else None + model_item = self.set_model_url(model_item) + return model_item, model_full_name, model, md5hash + + @staticmethod + def ask_tos(model_full_path): + """Ask the user to agree to the terms of service""" + tos_path = os.path.join(model_full_path, "tos_agreed.txt") + print(" > You must confirm the following:") + print(' | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"') + print(' | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]') + answer = input(" | | > ") + if answer.lower() == "y": + with open(tos_path, "w", encoding="utf-8") as f: + f.write("I have read, understood and agreed to the Terms and Conditions.") + return True + return False + + @staticmethod + def tos_agreed(model_item, model_full_path): + """Check if the user has agreed to the terms of service""" + if "tos_required" in model_item and model_item["tos_required"]: + tos_path = os.path.join(model_full_path, "tos_agreed.txt") + if os.path.exists(tos_path) or os.environ.get("COQUI_TOS_AGREED") == "1": + return True + return False + return True + + def create_dir_and_download_model(self, model_name, model_item, output_path): + os.makedirs(output_path, exist_ok=True) + # handle TOS + if not self.tos_agreed(model_item, output_path): + if not self.ask_tos(output_path): + os.rmdir(output_path) + raise Exception(" [!] 
You must agree to the terms of service to use this model.") + print(f" > Downloading model to {output_path}") + try: + if "fairseq" in model_name: + self.download_fairseq_model(model_name, output_path) + elif "github_rls_url" in model_item: + self._download_github_model(model_item, output_path) + elif "hf_url" in model_item: + self._download_hf_model(model_item, output_path) + + except requests.RequestException as e: + print(f" > Failed to download the model file to {output_path}") + rmtree(output_path) + raise e + self.print_model_license(model_item=model_item) + + def check_if_configs_are_equal(self, model_name, model_item, output_path): + with fsspec.open(self._find_files(output_path)[1], "r", encoding="utf-8") as f: + config_local = json.load(f) + remote_url = None + for url in model_item["hf_url"]: + if "config.json" in url: + remote_url = url + break + + with fsspec.open(remote_url, "r", encoding="utf-8") as f: + config_remote = json.load(f) + + if not config_local == config_remote: + print(f" > {model_name} is already downloaded however it has been changed. Redownloading it...") + self.create_dir_and_download_model(model_name, model_item, output_path) + + def download_model(self, model_name): + """Download model files given the full model name. + Model name is in the format + 'type/language/dataset/model' + e.g. 'tts_model/en/ljspeech/tacotron' + + Every model must have the following files: + - *.pth : pytorch model checkpoint file. + - config.json : model config file. + - scale_stats.npy (if exist): scale values for preprocessing. + + Args: + model_name (str): model name as explained above. + """ + model_item, model_full_name, model, md5sum = self._set_model_item(model_name) + # set the model specific output path + output_path = os.path.join(self.output_prefix, model_full_name) + if os.path.exists(output_path): + if md5sum is not None: + md5sum_file = os.path.join(output_path, "hash.md5") + if os.path.isfile(md5sum_file): + with open(md5sum_file, mode="r") as f: + if not f.read() == md5sum: + print(f" > {model_name} has been updated, clearing model cache...") + self.create_dir_and_download_model(model_name, model_item, output_path) + else: + print(f" > {model_name} is already downloaded.") + else: + print(f" > {model_name} has been updated, clearing model cache...") + self.create_dir_and_download_model(model_name, model_item, output_path) + # if the configs are different, redownload it + # ToDo: we need a better way to handle it + if "xtts" in model_name: + try: + self.check_if_configs_are_equal(model_name, model_item, output_path) + except: + pass + else: + print(f" > {model_name} is already downloaded.") + else: + self.create_dir_and_download_model(model_name, model_item, output_path) + + # find downloaded files + output_model_path = output_path + output_config_path = None + if ( + model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name + ): # TODO:This is stupid but don't care for now. 
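+            # For standard models, locate the downloaded checkpoint (*.pth) and config.json, then point the
+            # config's auxiliary file entries (scale_stats, speakers, speaker encoder) at the local folder.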
+ output_model_path, output_config_path = self._find_files(output_path) + # update paths in the config.json + self._update_paths(output_path, output_config_path) + return output_model_path, output_config_path, model_item + + @staticmethod + def _find_files(output_path: str) -> Tuple[str, str]: + """Find the model and config files in the output path + + Args: + output_path (str): path to the model files + + Returns: + Tuple[str, str]: path to the model file and config file + """ + model_file = None + config_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]: + model_file = os.path.join(output_path, file_name) + elif file_name == "config.json": + config_file = os.path.join(output_path, file_name) + if model_file is None: + raise ValueError(" [!] Model file not found in the output path") + if config_file is None: + raise ValueError(" [!] Config file not found in the output path") + return model_file, config_file + + @staticmethod + def _find_speaker_encoder(output_path: str) -> str: + """Find the speaker encoder file in the output path + + Args: + output_path (str): path to the model files + + Returns: + str: path to the speaker encoder file + """ + speaker_encoder_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_se.pth", "model_se.pth.tar"]: + speaker_encoder_file = os.path.join(output_path, file_name) + return speaker_encoder_file + + def _update_paths(self, output_path: str, config_path: str) -> None: + """Update paths for certain files in config.json after download. + + Args: + output_path (str): local path the model is downloaded to. + config_path (str): local config.json path. + """ + output_stats_path = os.path.join(output_path, "scale_stats.npy") + output_d_vector_file_path = os.path.join(output_path, "speakers.json") + output_d_vector_file_pth_path = os.path.join(output_path, "speakers.pth") + output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") + output_speaker_ids_file_pth_path = os.path.join(output_path, "speaker_ids.pth") + speaker_encoder_config_path = os.path.join(output_path, "config_se.json") + speaker_encoder_model_path = self._find_speaker_encoder(output_path) + + # update the scale_path.npy file path in the model config.json + self._update_path("audio.stats_path", output_stats_path, config_path) + + # update the speakers.json file path in the model config.json to the current path + self._update_path("d_vector_file", output_d_vector_file_path, config_path) + self._update_path("d_vector_file", output_d_vector_file_pth_path, config_path) + self._update_path("model_args.d_vector_file", output_d_vector_file_path, config_path) + self._update_path("model_args.d_vector_file", output_d_vector_file_pth_path, config_path) + + # update the speaker_ids.json file path in the model config.json to the current path + self._update_path("speakers_file", output_speaker_ids_file_path, config_path) + self._update_path("speakers_file", output_speaker_ids_file_pth_path, config_path) + self._update_path("model_args.speakers_file", output_speaker_ids_file_path, config_path) + self._update_path("model_args.speakers_file", output_speaker_ids_file_pth_path, config_path) + + # update the speaker_encoder file path in the model config.json to the current path + self._update_path("speaker_encoder_model_path", speaker_encoder_model_path, config_path) + self._update_path("model_args.speaker_encoder_model_path", speaker_encoder_model_path, config_path) + 
self._update_path("speaker_encoder_config_path", speaker_encoder_config_path, config_path) + self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) + + @staticmethod + def _update_path(field_name, new_path, config_path): + """Update the path in the model config.json for the current environment after download""" + if new_path and os.path.exists(new_path): + config = load_config(config_path) + field_names = field_name.split(".") + if len(field_names) > 1: + # field name points to a sub-level field + sub_conf = config + for fd in field_names[:-1]: + if fd in sub_conf: + sub_conf = sub_conf[fd] + else: + return + if isinstance(sub_conf[field_names[-1]], list): + sub_conf[field_names[-1]] = [new_path] + else: + sub_conf[field_names[-1]] = new_path + else: + # field name points to a top-level field + if not field_name in config: + return + if isinstance(config[field_name], list): + config[field_name] = [new_path] + else: + config[field_name] = new_path + config.save_json(config_path) + + @staticmethod + def _download_zip_file(file_url, output_folder, progress_bar): + """Download the github releases""" + # download the file + r = requests.get(file_url, stream=True) + # extract the file + try: + total_size_in_bytes = int(r.headers.get("content-length", 0)) + block_size = 1024 # 1 Kibibyte + if progress_bar: + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) + with open(temp_zip_name, "wb") as file: + for data in r.iter_content(block_size): + if progress_bar: + ModelManager.tqdm_progress.update(len(data)) + file.write(data) + with zipfile.ZipFile(temp_zip_name) as z: + z.extractall(output_folder) + os.remove(temp_zip_name) # delete zip after extract + except zipfile.BadZipFile: + print(f" > Error: Bad zip file - {file_url}") + raise zipfile.BadZipFile # pylint: disable=raise-missing-from + # move the files to the outer path + for file_path in z.namelist(): + src_path = os.path.join(output_folder, file_path) + if os.path.isfile(src_path): + dst_path = os.path.join(output_folder, os.path.basename(file_path)) + if src_path != dst_path: + copyfile(src_path, dst_path) + # remove redundant (hidden or not) folders + for file_path in z.namelist(): + if os.path.isdir(os.path.join(output_folder, file_path)): + rmtree(os.path.join(output_folder, file_path)) + + @staticmethod + def _download_tar_file(file_url, output_folder, progress_bar): + """Download the github releases""" + # download the file + r = requests.get(file_url, stream=True) + # extract the file + try: + total_size_in_bytes = int(r.headers.get("content-length", 0)) + block_size = 1024 # 1 Kibibyte + if progress_bar: + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1]) + with open(temp_tar_name, "wb") as file: + for data in r.iter_content(block_size): + if progress_bar: + ModelManager.tqdm_progress.update(len(data)) + file.write(data) + with tarfile.open(temp_tar_name) as t: + t.extractall(output_folder) + tar_names = t.getnames() + os.remove(temp_tar_name) # delete tar after extract + except tarfile.ReadError: + print(f" > Error: Bad tar file - {file_url}") + raise tarfile.ReadError # pylint: disable=raise-missing-from + # move the files to the outer path + for file_path in os.listdir(os.path.join(output_folder, tar_names[0])): + src_path = os.path.join(output_folder, tar_names[0], 
file_path) + dst_path = os.path.join(output_folder, os.path.basename(file_path)) + if src_path != dst_path: + copyfile(src_path, dst_path) + # remove the extracted folder + rmtree(os.path.join(output_folder, tar_names[0])) + + @staticmethod + def _download_model_files(file_urls, output_folder, progress_bar): + """Download the github releases""" + for file_url in file_urls: + # download the file + r = requests.get(file_url, stream=True) + # extract the file + bease_filename = file_url.split("/")[-1] + temp_zip_name = os.path.join(output_folder, bease_filename) + total_size_in_bytes = int(r.headers.get("content-length", 0)) + block_size = 1024 # 1 Kibibyte + with open(temp_zip_name, "wb") as file: + if progress_bar: + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + for data in r.iter_content(block_size): + if progress_bar: + ModelManager.tqdm_progress.update(len(data)) + file.write(data) + + @staticmethod + def _check_dict_key(my_dict, key): + if key in my_dict.keys() and my_dict[key] is not None: + if not isinstance(key, str): + return True + if isinstance(key, str) and len(my_dict[key]) > 0: + return True + return False diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py new file mode 100644 index 0000000..cbd1499 --- /dev/null +++ b/TTS/utils/radam.py @@ -0,0 +1,105 @@ +# modified from https://github.com/LiyuanLucasLiu/RAdam + +import math + +import torch +from torch.optim.optimizer import Optimizer + + +class RAdam(Optimizer): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + if lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if eps < 0.0: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): + param["buffer"] = [[None, None, None] for _ in range(10)] + defaults = dict( + lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)] + ) + super().__init__(params, defaults) + + def __setstate__(self, state): # pylint: disable=useless-super-delegation + super().__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError("RAdam does not support sparse gradients") + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p_data_fp32) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) + else: + state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] + + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + + state["step"] += 1 + buffered = group["buffer"][int(state["step"] % 10)] + if state["step"] == buffered[0]: + N_sma, step_size = 
buffered[1], buffered[2] + else: + buffered[0] = state["step"] + beta2_t = beta2 ** state["step"] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) + * (N_sma - 4) + / (N_sma_max - 4) + * (N_sma - 2) + / N_sma + * N_sma_max + / (N_sma_max - 2) + ) / (1 - beta1 ** state["step"]) + elif self.degenerated_to_sgd: + step_size = 1.0 / (1 - beta1 ** state["step"]) + else: + step_size = -1 + buffered[2] = step_size + + # more conservative since it's an approximated value + if N_sma >= 5: + if group["weight_decay"] != 0: + p_data_fp32.add_(p_data_fp32, alpha=-group["weight_decay"] * group["lr"]) + denom = exp_avg_sq.sqrt().add_(group["eps"]) + p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group["lr"]) + p.data.copy_(p_data_fp32) + elif step_size > 0: + if group["weight_decay"] != 0: + p_data_fp32.add_(p_data_fp32, alpha=-group["weight_decay"] * group["lr"]) + p_data_fp32.add_(exp_avg, alpha=-step_size * group["lr"]) + p.data.copy_(p_data_fp32) + + return loss diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py new file mode 100644 index 0000000..b08a763 --- /dev/null +++ b/TTS/utils/samplers.py @@ -0,0 +1,201 @@ +import math +import random +from typing import Callable, List, Union + +from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler + + +class SubsetSampler(Sampler): + """ + Samples elements sequentially from a given list of indices. + + Args: + indices (list): a sequence of indices + """ + + def __init__(self, indices): + super().__init__(indices) + self.indices = indices + + def __iter__(self): + return (self.indices[i] for i in range(len(self.indices))) + + def __len__(self): + return len(self.indices) + + +class PerfectBatchSampler(Sampler): + """ + Samples a mini-batch of indices for a balanced class batching + + Args: + dataset_items(list): dataset items to sample from. + classes (list): list of classes of dataset_items to sample from. + batch_size (int): total number of samples to be sampled in a mini-batch. + num_gpus (int): number of GPU in the data parallel mode. + shuffle (bool): if True, samples randomly, otherwise samples sequentially. + drop_last (bool): if True, drops last incomplete batch. + """ + + def __init__( + self, + dataset_items, + classes, + batch_size, + num_classes_in_batch, + num_gpus=1, + shuffle=True, + drop_last=False, + label_key="class_name", + ): + super().__init__(dataset_items) + assert ( + batch_size % (num_classes_in_batch * num_gpus) == 0 + ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." 
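+        # Build one index list per class label so each mini-batch can draw an equal number of samples per class.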
+ + label_indices = {} + for idx, item in enumerate(dataset_items): + label = item[label_key] + if label not in label_indices.keys(): + label_indices[label] = [idx] + else: + label_indices[label].append(idx) + + if shuffle: + self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes] + else: + self._samplers = [SubsetSampler(label_indices[key]) for key in classes] + + self._batch_size = batch_size + self._drop_last = drop_last + self._dp_devices = num_gpus + self._num_classes_in_batch = num_classes_in_batch + + def __iter__(self): + batch = [] + if self._num_classes_in_batch != len(self._samplers): + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + else: + valid_samplers_idx = None + + iters = [iter(s) for s in self._samplers] + done = False + + while True: + b = [] + for i, it in enumerate(iters): + if valid_samplers_idx is not None and i not in valid_samplers_idx: + continue + idx = next(it, None) + if idx is None: + done = True + break + b.append(idx) + if done: + break + batch += b + if len(batch) == self._batch_size: + yield batch + batch = [] + if valid_samplers_idx is not None: + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + + if not self._drop_last: + if len(batch) > 0: + groups = len(batch) // self._num_classes_in_batch + if groups % self._dp_devices == 0: + yield batch + else: + batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] + if len(batch) > 0: + yield batch + + def __len__(self): + class_batch_size = self._batch_size // self._num_classes_in_batch + return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers) + + +def identity(x): + return x + + +class SortedSampler(Sampler): + """Samples elements sequentially, always in the same order. + + Taken from https://github.com/PetrochukM/PyTorch-NLP + + Args: + data (iterable): Iterable data. + sort_key (callable): Specifies a function of one argument that is used to extract a + numerical comparison key from each list element. + + Example: + >>> list(SortedSampler(range(10), sort_key=lambda i: -i)) + [9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + + """ + + def __init__(self, data, sort_key: Callable = identity): + super().__init__(data) + self.data = data + self.sort_key = sort_key + zip_ = [(i, self.sort_key(row)) for i, row in enumerate(self.data)] + zip_ = sorted(zip_, key=lambda r: r[1]) + self.sorted_indexes = [item[0] for item in zip_] + + def __iter__(self): + return iter(self.sorted_indexes) + + def __len__(self): + return len(self.data) + + +class BucketBatchSampler(BatchSampler): + """Bucket batch sampler + + Adapted from https://github.com/PetrochukM/PyTorch-NLP + + Args: + sampler (torch.data.utils.sampler.Sampler): + batch_size (int): Size of mini-batch. + drop_last (bool): If `True` the sampler will drop the last batch if its size would be less + than `batch_size`. + data (list): List of data samples. + sort_key (callable, optional): Callable to specify a comparison key for sorting. + bucket_size_multiplier (int, optional): Buckets are of size + `batch_size * bucket_size_multiplier`. 
+ + Example: + >>> sampler = WeightedRandomSampler(weights, len(weights)) + >>> sampler = BucketBatchSampler(sampler, data=data_items, batch_size=32, drop_last=True) + """ + + def __init__( + self, + sampler, + data, + batch_size, + drop_last, + sort_key: Union[Callable, List] = identity, + bucket_size_multiplier=100, + ): + super().__init__(sampler, batch_size, drop_last) + self.data = data + self.sort_key = sort_key + _bucket_size = batch_size * bucket_size_multiplier + if hasattr(sampler, "__len__"): + _bucket_size = min(_bucket_size, len(sampler)) + self.bucket_sampler = BatchSampler(sampler, _bucket_size, False) + + def __iter__(self): + for idxs in self.bucket_sampler: + bucket_data = [self.data[idx] for idx in idxs] + sorted_sampler = SortedSampler(bucket_data, self.sort_key) + for batch_idx in SubsetRandomSampler(list(BatchSampler(sorted_sampler, self.batch_size, self.drop_last))): + sorted_idxs = [idxs[i] for i in batch_idx] + yield sorted_idxs + + def __len__(self): + if self.drop_last: + return len(self.sampler) // self.batch_size + return math.ceil(len(self.sampler) / self.batch_size) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py new file mode 100644 index 0000000..b98647c --- /dev/null +++ b/TTS/utils/synthesizer.py @@ -0,0 +1,505 @@ +import os +import time +from typing import List + +import numpy as np +import pysbd +import torch +from torch import nn + +from TTS.config import load_config +from TTS.tts.configs.vits_config import VitsConfig +from TTS.tts.models import setup_model as setup_tts_model +from TTS.tts.models.vits import Vits + +# pylint: disable=unused-wildcard-import +# pylint: disable=wildcard-import +from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence +from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.numpy_transforms import save_wav +from TTS.vc.models import setup_model as setup_vc_model +from TTS.vocoder.models import setup_model as setup_vocoder_model +from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input + + +class Synthesizer(nn.Module): + def __init__( + self, + tts_checkpoint: str = "", + tts_config_path: str = "", + tts_speakers_file: str = "", + tts_languages_file: str = "", + vocoder_checkpoint: str = "", + vocoder_config: str = "", + encoder_checkpoint: str = "", + encoder_config: str = "", + vc_checkpoint: str = "", + vc_config: str = "", + model_dir: str = "", + voice_dir: str = None, + use_cuda: bool = False, + ) -> None: + """General 🐸 TTS interface for inference. It takes a tts and a vocoder + model and synthesize speech from the provided text. + + The text is divided into a list of sentences using `pysbd` and synthesize + speech on each sentence separately. + + If you have certain special characters in your text, you need to handle + them before providing the text to Synthesizer. + + TODO: set the segmenter based on the source language + + Args: + tts_checkpoint (str, optional): path to the tts model file. + tts_config_path (str, optional): path to the tts config file. + vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None. + vocoder_config (str, optional): path to the vocoder config file. Defaults to None. + encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`, + encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`, + vc_checkpoint (str, optional): path to the voice conversion model file. 
Defaults to `""`, + vc_config (str, optional): path to the voice conversion config file. Defaults to `""`, + use_cuda (bool, optional): enable/disable cuda. Defaults to False. + """ + super().__init__() + self.tts_checkpoint = tts_checkpoint + self.tts_config_path = tts_config_path + self.tts_speakers_file = tts_speakers_file + self.tts_languages_file = tts_languages_file + self.vocoder_checkpoint = vocoder_checkpoint + self.vocoder_config = vocoder_config + self.encoder_checkpoint = encoder_checkpoint + self.encoder_config = encoder_config + self.vc_checkpoint = vc_checkpoint + self.vc_config = vc_config + self.use_cuda = use_cuda + + self.tts_model = None + self.vocoder_model = None + self.vc_model = None + self.speaker_manager = None + self.tts_speakers = {} + self.language_manager = None + self.num_languages = 0 + self.tts_languages = {} + self.d_vector_dim = 0 + self.seg = self._get_segmenter("en") + self.use_cuda = use_cuda + self.voice_dir = voice_dir + if self.use_cuda: + assert torch.cuda.is_available(), "CUDA is not availabe on this machine." + + if tts_checkpoint: + self._load_tts(tts_checkpoint, tts_config_path, use_cuda) + self.output_sample_rate = self.tts_config.audio["sample_rate"] + + if vocoder_checkpoint: + self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) + self.output_sample_rate = self.vocoder_config.audio["sample_rate"] + + if vc_checkpoint: + self._load_vc(vc_checkpoint, vc_config, use_cuda) + self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + + if model_dir: + if "fairseq" in model_dir: + self._load_fairseq_from_dir(model_dir, use_cuda) + self.output_sample_rate = self.tts_config.audio["sample_rate"] + else: + self._load_tts_from_dir(model_dir, use_cuda) + self.output_sample_rate = self.tts_config.audio["output_sample_rate"] + + @staticmethod + def _get_segmenter(lang: str): + """get the sentence segmenter for the given language. + + Args: + lang (str): target language code. + + Returns: + [type]: [description] + """ + return pysbd.Segmenter(language=lang, clean=True) + + def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> None: + """Load the voice conversion model. + + 1. Load the model config. + 2. Init the model from the config. + 3. Load the model weights. + 4. Move the model to the GPU if CUDA is enabled. + + Args: + vc_checkpoint (str): path to the model checkpoint. + tts_config_path (str): path to the model config file. + use_cuda (bool): enable/disable CUDA use. + """ + # pylint: disable=global-statement + self.vc_config = load_config(vc_config_path) + self.vc_model = setup_vc_model(config=self.vc_config) + self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint) + if use_cuda: + self.vc_model.cuda() + + def _load_fairseq_from_dir(self, model_dir: str, use_cuda: bool) -> None: + """Load the fairseq model from a directory. + + We assume it is VITS and the model knows how to load itself from the directory and there is a config.json file in the directory. + """ + self.tts_config = VitsConfig() + self.tts_model = Vits.init_from_config(self.tts_config) + self.tts_model.load_fairseq_checkpoint(self.tts_config, checkpoint_dir=model_dir, eval=True) + self.tts_config = self.tts_model.config + if use_cuda: + self.tts_model.cuda() + + def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None: + """Load the TTS model from a directory. + + We assume the model knows how to load itself from the directory and there is a config.json file in the directory. 
+ """ + config = load_config(os.path.join(model_dir, "config.json")) + self.tts_config = config + self.tts_model = setup_tts_model(config) + self.tts_model.load_checkpoint(config, checkpoint_dir=model_dir, eval=True) + if use_cuda: + self.tts_model.cuda() + + def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: + """Load the TTS model. + + 1. Load the model config. + 2. Init the model from the config. + 3. Load the model weights. + 4. Move the model to the GPU if CUDA is enabled. + 5. Init the speaker manager in the model. + + Args: + tts_checkpoint (str): path to the model checkpoint. + tts_config_path (str): path to the model config file. + use_cuda (bool): enable/disable CUDA use. + """ + # pylint: disable=global-statement + self.tts_config = load_config(tts_config_path) + if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: + raise ValueError("Phonemizer is not defined in the TTS config.") + + self.tts_model = setup_tts_model(config=self.tts_config) + + if not self.encoder_checkpoint: + self._set_speaker_encoder_paths_from_tts_config() + + self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) + if use_cuda: + self.tts_model.cuda() + + if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): + self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config, use_cuda) + + def _set_speaker_encoder_paths_from_tts_config(self): + """Set the encoder paths from the tts model config for models with speaker encoders.""" + if hasattr(self.tts_config, "model_args") and hasattr( + self.tts_config.model_args, "speaker_encoder_config_path" + ): + self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path + self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path + + def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: + """Load the vocoder model. + + 1. Load the vocoder config. + 2. Init the AudioProcessor for the vocoder. + 3. Init the vocoder model from the config. + 4. Move the model to the GPU if CUDA is enabled. + + Args: + model_file (str): path to the model checkpoint. + model_config (str): path to the model config file. + use_cuda (bool): enable/disable CUDA use. + """ + self.vocoder_config = load_config(model_config) + self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) + self.vocoder_model = setup_vocoder_model(self.vocoder_config) + self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) + if use_cuda: + self.vocoder_model.cuda() + + def split_into_sentences(self, text) -> List[str]: + """Split give text into sentences. + + Args: + text (str): input text in string format. + + Returns: + List[str]: list of sentences. + """ + return self.seg.segment(text) + + def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: + """Save the waveform as a file. + + Args: + wav (List[int]): waveform as a list of values. + path (str): output path to save the waveform. + pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. 
+ """ + # if tensor convert to numpy + if torch.is_tensor(wav): + wav = wav.cpu().numpy() + if isinstance(wav, list): + wav = np.array(wav) + save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) + + def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: + output_wav = self.vc_model.voice_conversion(source_wav, target_wav) + return output_wav + + def tts( + self, + text: str = "", + speaker_name: str = "", + language_name: str = "", + speaker_wav=None, + style_wav=None, + style_text=None, + reference_wav=None, + reference_speaker_name=None, + split_sentences: bool = True, + **kwargs, + ) -> List[int]: + """🐸 TTS magic. Run all the models and generate speech. + + Args: + text (str): input text. + speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "". + language_name (str, optional): language id for multi-language models. Defaults to "". + speaker_wav (Union[str, List[str]], optional): path to the speaker wav for voice cloning. Defaults to None. + style_wav ([type], optional): style waveform for GST. Defaults to None. + style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None. + reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. + reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None. + split_sentences (bool, optional): split the input text into sentences. Defaults to True. + **kwargs: additional arguments to pass to the TTS model. + Returns: + List[int]: [description] + """ + start_time = time.time() + wavs = [] + + if not text and not reference_wav: + raise ValueError( + "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." + ) + + if text: + sens = [text] + if split_sentences: + print(" > Text splitted to sentences.") + sens = self.split_into_sentences(text) + print(sens) + + # handle multi-speaker + if "voice_dir" in kwargs: + self.voice_dir = kwargs["voice_dir"] + kwargs.pop("voice_dir") + speaker_embedding = None + speaker_id = None + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"): + if speaker_name and isinstance(speaker_name, str) and not self.tts_config.model == "xtts": + if self.tts_config.use_d_vector_file: + # get the average speaker embedding from the saved d_vectors. + speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( + speaker_name, num_samples=None, randomize=False + ) + speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] + else: + # get speaker idx from the speaker name + speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name] + # handle Neon models with single speaker. + elif len(self.tts_model.speaker_manager.name_to_id) == 1: + speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0] + elif not speaker_name and not speaker_wav: + raise ValueError( + " [!] Looks like you are using a multi-speaker model. " + "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model." + ) + else: + speaker_embedding = None + else: + if speaker_name and self.voice_dir is None: + raise ValueError( + f" [!] Missing speakers.json file path for selecting speaker {speaker_name}." + "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. 
" + ) + + # handle multi-lingual + language_id = None + if self.tts_languages_file or ( + hasattr(self.tts_model, "language_manager") + and self.tts_model.language_manager is not None + and not self.tts_config.model == "xtts" + ): + if len(self.tts_model.language_manager.name_to_id) == 1: + language_id = list(self.tts_model.language_manager.name_to_id.values())[0] + + elif language_name and isinstance(language_name, str): + try: + language_id = self.tts_model.language_manager.name_to_id[language_name] + except KeyError as e: + raise ValueError( + f" [!] Looks like you use a multi-lingual model. " + f"Language {language_name} is not in the available languages: " + f"{self.tts_model.language_manager.name_to_id.keys()}." + ) from e + + elif not language_name: + raise ValueError( + " [!] Look like you use a multi-lingual model. " + "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model." + ) + + else: + raise ValueError( + f" [!] Missing language_ids.json file path for selecting language {language_name}." + "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. " + ) + + # compute a new d_vector from the given clip. + if ( + speaker_wav is not None + and self.tts_model.speaker_manager is not None + and hasattr(self.tts_model.speaker_manager, "encoder_ap") + and self.tts_model.speaker_manager.encoder_ap is not None + ): + speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) + + vocoder_device = "cpu" + use_gl = self.vocoder_model is None + if not use_gl: + vocoder_device = next(self.vocoder_model.parameters()).device + if self.use_cuda: + vocoder_device = "cuda" + + if not reference_wav: # not voice conversion + for sen in sens: + if hasattr(self.tts_model, "synthesize"): + outputs = self.tts_model.synthesize( + text=sen, + config=self.tts_config, + speaker_id=speaker_name, + voice_dirs=self.voice_dir, + d_vector=speaker_embedding, + speaker_wav=speaker_wav, + language=language_name, + **kwargs, + ) + else: + # synthesize voice + outputs = synthesis( + model=self.tts_model, + text=sen, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + speaker_id=speaker_id, + style_wav=style_wav, + style_text=style_text, + use_griffin_lim=use_gl, + d_vector=speaker_embedding, + language_id=language_id, + ) + waveform = outputs["wav"] + if not use_gl: + mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() + # denormalize tts output based on tts audio config + mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T + # renormalize spectrogram based on vocoder config + vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T) + # compute scale factor for possible sample rate mismatch + scale_factor = [ + 1, + self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate, + ] + if scale_factor[1] != 1: + print(" > interpolating tts model output.") + vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input) + else: + vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable + # run vocoder model + # [1, T, C] + waveform = self.vocoder_model.inference(vocoder_input.to(vocoder_device)) + if torch.is_tensor(waveform) and waveform.device != torch.device("cpu") and not use_gl: + waveform = waveform.cpu() + if not use_gl: + waveform = waveform.numpy() + waveform = waveform.squeeze() + + # trim silence + if "do_trim_silence" in self.tts_config.audio and self.tts_config.audio["do_trim_silence"]: + 
waveform = trim_silence(waveform, self.tts_model.ap) + + wavs += list(waveform) + wavs += [0] * 10000 + else: + # get the speaker embedding or speaker id for the reference wav file + reference_speaker_embedding = None + reference_speaker_id = None + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"): + if reference_speaker_name and isinstance(reference_speaker_name, str): + if self.tts_config.use_d_vector_file: + # get the speaker embedding from the saved d_vectors. + reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name( + reference_speaker_name + )[0] + reference_speaker_embedding = np.array(reference_speaker_embedding)[ + None, : + ] # [1 x embedding_dim] + else: + # get speaker idx from the speaker name + reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name] + else: + reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip( + reference_wav + ) + outputs = transfer_voice( + model=self.tts_model, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + reference_wav=reference_wav, + speaker_id=speaker_id, + d_vector=speaker_embedding, + use_griffin_lim=use_gl, + reference_speaker_id=reference_speaker_id, + reference_d_vector=reference_speaker_embedding, + ) + waveform = outputs + if not use_gl: + mel_postnet_spec = outputs[0].detach().cpu().numpy() + # denormalize tts output based on tts audio config + mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T + # renormalize spectrogram based on vocoder config + vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T) + # compute scale factor for possible sample rate mismatch + scale_factor = [ + 1, + self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate, + ] + if scale_factor[1] != 1: + print(" > interpolating tts model output.") + vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input) + else: + vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable + # run vocoder model + # [1, T, C] + waveform = self.vocoder_model.inference(vocoder_input.to(vocoder_device)) + if torch.is_tensor(waveform) and waveform.device != torch.device("cpu"): + waveform = waveform.cpu() + if not use_gl: + waveform = waveform.numpy() + wavs = waveform.squeeze() + + # compute stats + process_time = time.time() - start_time + audio_time = len(wavs) / self.tts_config.audio["sample_rate"] + print(f" > Processing time: {process_time}") + print(f" > Real-time factor: {process_time / audio_time}") + return wavs diff --git a/TTS/utils/training.py b/TTS/utils/training.py new file mode 100644 index 0000000..b51f55e --- /dev/null +++ b/TTS/utils/training.py @@ -0,0 +1,44 @@ +import numpy as np +import torch + + +def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): + r"""Check model gradient against unexpected jumps and failures""" + skip_flag = False + if ignore_stopnet: + if not amp_opt_params: + grad_norm = torch.nn.utils.clip_grad_norm_( + [param for name, param in model.named_parameters() if "stopnet" not in name], grad_clip + ) + else: + grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) + else: + if not amp_opt_params: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + else: + grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) + + # compatibility with different torch versions + if isinstance(grad_norm, float): + if np.isinf(grad_norm): + print(" | > Gradient is INF 
!!") + skip_flag = True + else: + if torch.isinf(grad_norm): + print(" | > Gradient is INF !!") + skip_flag = True + return grad_norm, skip_flag + + +def gradual_training_scheduler(global_step, config): + """Setup the gradual training schedule wrt number + of active GPUs""" + num_gpus = torch.cuda.device_count() + if num_gpus == 0: + num_gpus = 1 + new_values = None + # we set the scheduling wrt num_gpus + for values in config.gradual_training: + if global_step * num_gpus >= values[0]: + new_values = values + return new_values[1], new_values[2] diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py new file mode 100644 index 0000000..aefce2b --- /dev/null +++ b/TTS/utils/vad.py @@ -0,0 +1,88 @@ +import torch +import torchaudio + + +def read_audio(path): + wav, sr = torchaudio.load(path) + + if wav.size(0) > 1: + wav = wav.mean(dim=0, keepdim=True) + + return wav.squeeze(0), sr + + +def resample_wav(wav, sr, new_sr): + wav = wav.unsqueeze(0) + transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr) + wav = transform(wav) + return wav.squeeze(0) + + +def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): + factor = new_sr / vad_sr + new_timestamps = [] + if just_begging_end and timestamps: + # get just the start and end timestamps + new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)} + new_timestamps.append(new_dict) + else: + for ts in timestamps: + # map to the new SR + new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)} + new_timestamps.append(new_dict) + + return new_timestamps + + +def get_vad_model_and_utils(use_cuda=False, use_onnx=False): + model, utils = torch.hub.load( + repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=use_onnx, force_onnx_cpu=True + ) + if use_cuda: + model = model.cuda() + + get_speech_timestamps, save_audio, _, _, collect_chunks = utils + return model, get_speech_timestamps, save_audio, collect_chunks + + +def remove_silence( + model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False +): + # get the VAD model and utils functions + model, get_speech_timestamps, _, collect_chunks = model_and_utils + + # read ground truth wav and resample the audio for the VAD + try: + wav, gt_sample_rate = read_audio(audio_path) + except: + print(f"> ❗ Failed to read {audio_path}") + return None, False + + # if needed, resample the audio for the VAD model + if gt_sample_rate != vad_sample_rate: + wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate) + else: + wav_vad = wav + + if use_cuda: + wav_vad = wav_vad.cuda() + + # get speech timestamps from full audio file + speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768) + + # map the current speech_timestamps to the sample rate of the ground truth audio + new_speech_timestamps = map_timestamps_to_new_sr( + vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end + ) + + # if have speech timestamps else save the wav + if new_speech_timestamps: + wav = collect_chunks(new_speech_timestamps, wav) + is_speech = True + else: + print(f"> The file {audio_path} probably does not have speech please check it !!") + is_speech = False + + # save + torchaudio.save(out_path, wav[None, :], gt_sample_rate) + return out_path, is_speech