Add files via upload
This commit is contained in:
@@ -0,0 +1,48 @@
|
||||
"""Get detailed info about the working environment."""
|
||||
import os
|
||||
import platform
|
||||
import sys
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
sys.path += [os.path.abspath(".."), os.path.abspath(".")]
|
||||
import json
|
||||
|
||||
import TTS
|
||||
|
||||
|
||||
def system_info():
    """Collect basic facts about the host platform and Python interpreter.

    Returns:
        dict: Values reported by the ``platform`` module under the keys
        ``OS``, ``architecture``, ``version``, ``processor`` and ``python``.
    """
    info = {}
    info["OS"] = platform.system()
    info["architecture"] = platform.architecture()
    info["version"] = platform.version()
    info["processor"] = platform.processor()
    info["python"] = platform.python_version()
    return info
|
||||
|
||||
|
||||
def cuda_info():
    """Describe the CUDA setup as seen by PyTorch.

    Returns:
        dict: ``GPU`` holds one device name per device counted by
        ``torch.cuda.device_count()``, ``available`` is the result of
        ``torch.cuda.is_available()`` and ``version`` is ``torch.version.cuda``.
    """
    device_names = []
    for device_index in range(torch.cuda.device_count()):
        device_names.append(torch.cuda.get_device_name(device_index))

    return {
        "GPU": device_names,
        "available": torch.cuda.is_available(),
        "version": torch.version.cuda,
    }
|
||||
|
||||
|
||||
def package_info():
    """Report the versions of the key packages in the environment.

    Returns:
        dict: The ``__version__`` of numpy, PyTorch and TTS plus
        ``torch.version.debug`` under the key ``PyTorch_debug``.
    """
    versions = {}
    versions["numpy"] = numpy.__version__
    versions["PyTorch_version"] = torch.__version__
    versions["PyTorch_debug"] = torch.version.debug
    versions["TTS"] = TTS.__version__
    return versions
|
||||
|
||||
|
||||
def main():
    """Print a JSON report covering the system, CUDA and package details."""
    report = {
        "System": system_info(),
        "CUDA": cuda_info(),
        "Packages": package_info(),
    }
    rendered = json.dumps(report, indent=4, sort_keys=True)
    print(rendered)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,165 @@
|
||||
import argparse
|
||||
import importlib
|
||||
import os
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.datasets.TTSDataset import TTSDataset
|
||||
from TTS.tts.models import setup_model
|
||||
from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.io import load_checkpoint
|
||||
|
||||
def str2bool(value):
    """Convert a command-line token into a boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty text (including "False" and "0") becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: The raw argument string (or an actual ``bool``).

    Returns:
        bool: The parsed flag. An empty string maps to ``False``, matching what
        ``bool("")`` produced before.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("", "false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
        """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
        """
Example run:
CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
--config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
--dataset_metafile metadata.csv
--data_path /root/LJSpeech-1.1/
--batch_size 32
--dataset ljspeech
--use_cuda True
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path to Tacotron/Tacotron2 config file.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="",
        required=True,
        help="Target dataset processor name from TTS.tts.dataset.preprocess.",
    )

    parser.add_argument(
        "--dataset_metafile",
        type=str,
        default="",
        required=True,
        help="Dataset metafile inclusing file paths with transcripts.",
    )
    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
    # str2bool instead of bool: bool("False") is True, so the flag could never be switched off.
    parser.add_argument("--use_cuda", type=str2bool, default=False, help="enable/disable cuda.")

    parser.add_argument(
        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
    )
    args = parser.parse_args()

    C = load_config(args.config_path)
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if "characters" in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    # TODO: handle multi-speaker
    model = setup_model(C)
    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)

    # data loader: the formatter function is looked up by name in the formatters module
    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
    preprocessor = getattr(preprocessor, args.dataset)
    meta_data = preprocessor(args.data_path, args.dataset_metafile)
    dataset = TTSDataset(
        model.decoder.r,
        C.text_cleaner,
        compute_linear_spec=False,
        ap=ap,
        meta_data=meta_data,
        characters=C.characters if "characters" in C.keys() else None,
        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
        use_phonemes=C.use_phonemes,
        phoneme_cache_path=C.phoneme_cache_path,
        phoneme_language=C.phoneme_language,
        enable_eos_bos=C.enable_eos_bos_chars,
    )

    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        drop_last=False,
    )

    # compute attentions
    file_paths = []
    with torch.no_grad():
        for data in tqdm(loader):
            # setup input data (only the batch fields used below are unpacked)
            text_input = data[0]
            text_lengths = data[1]
            mel_input = data[4]
            mel_lengths = data[5]
            item_idxs = data[7]

            # dispatch data to GPU
            if args.use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            model_outputs = model.forward(text_input, text_lengths, mel_input)

            alignments = model_outputs["alignments"].detach()
            for idx, alignment in enumerate(alignments):
                item_idx = item_idxs[idx]
                # interpolate if r > 1
                alignment = (
                    torch.nn.functional.interpolate(
                        alignment.transpose(0, 1).unsqueeze(0),
                        size=None,
                        scale_factor=model.decoder.r,
                        mode="nearest",
                        align_corners=None,
                        recompute_scale_factor=None,
                    )
                    .squeeze(0)
                    .transpose(0, 1)
                )
                # remove paddings
                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
                # set file paths
                wav_file_name = os.path.basename(item_idx)
                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
                # Join with the directory rather than str.replace(): replace() would also
                # rewrite any directory component that happens to contain the file name.
                file_path = os.path.join(os.path.dirname(item_idx), align_file_name)
                # save output
                wav_file_abs_path = os.path.abspath(item_idx)
                file_abs_path = os.path.abspath(file_path)
                file_paths.append([wav_file_abs_path, file_abs_path])
                np.save(file_path, alignment)

    # output metafile: one "wav_path|attention_path" line per processed item
    metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

    with open(metafile, "w", encoding="utf-8") as f:
        for p in file_paths:
            f.write(f"{p[0]}|{p[1]}\n")
    print(f" >> Metafile created: {metafile}")
|
||||
@@ -0,0 +1,197 @@
|
||||
import argparse
|
||||
import os
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.config.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.managers import save_file
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
|
||||
|
||||
def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_speakers_file=None,
    old_append=False,
    config_dataset_path=None,
    formatter_name=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    """Compute one embedding per dataset sample and save the resulting mapping.

    The dataset comes either from a dataset config file (``config_dataset_path``)
    or from a ``BaseDatasetConfig`` filled with the individual dataset arguments.
    Each sample's ``audio_unique_name`` becomes a key of the saved mapping, whose
    values hold the sample's class name and its embedding.

    Args:
        model_path: Encoder checkpoint passed to ``SpeakerManager``.
        config_path: Encoder config passed to ``SpeakerManager``.
        output_path: Output file, or an existing directory in which
            ``speakers.pth`` is written.
        old_speakers_file: Optional existing embedding file; embeddings for clips
            listed in it are reused instead of recomputed.
        old_append: When true (and ``old_speakers_file`` is given) the old
            mapping itself is extended rather than starting from an empty one.
        config_dataset_path: Dataset config file; takes precedence over the
            individual dataset arguments.
        formatter_name: Formatter set on the ad-hoc dataset config.
        dataset_name: Dataset name set on the ad-hoc dataset config.
        dataset_path: Dataset path set on the ad-hoc dataset config.
        meta_file_train: Optional train metafile for the ad-hoc dataset config.
        meta_file_val: Optional eval metafile for the ad-hoc dataset config.
        disable_cuda: Do not use CUDA even if it is available.
        no_eval: Load the samples without an eval split.
    """
    use_cuda = torch.cuda.is_available() and not disable_cuda
    want_eval_split = not no_eval

    if config_dataset_path is None:
        # Build a one-off dataset config from the individual arguments.
        dataset_config = BaseDatasetConfig()
        dataset_config.formatter = formatter_name
        dataset_config.dataset_name = dataset_name
        dataset_config.path = dataset_path
        if meta_file_train is not None:
            dataset_config.meta_file_train = meta_file_train
        if meta_file_val is not None:
            dataset_config.meta_file_val = meta_file_val
        train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=want_eval_split)
    else:
        dataset_config = load_config(config_dataset_path)
        train_samples, eval_samples = load_tts_samples(dataset_config.datasets, eval_split=want_eval_split)

    samples = train_samples if eval_samples is None else train_samples + eval_samples

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_speakers_file,
        use_cuda=use_cuda,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key
    have_old_file = old_speakers_file is not None

    # compute speaker embeddings, optionally continuing from the old mapping
    speaker_mapping = encoder_manager.embeddings if have_old_file and old_append else {}

    for sample in tqdm(samples):
        class_name = sample[class_name_key]
        audio_file = sample["audio_file"]
        embedding_key = sample["audio_unique_name"]

        # Entry already present: only refresh its class name.
        if embedding_key in speaker_mapping:
            speaker_mapping[embedding_key]["name"] = class_name
            continue

        if have_old_file and embedding_key in encoder_manager.clip_ids:
            # reuse the embedding stored in the old file
            embedding = encoder_manager.get_embedding_by_clip(embedding_key)
        else:
            # run the encoder on the audio clip
            embedding = encoder_manager.compute_embedding_from_clip(audio_file)

        speaker_mapping[embedding_key] = {"name": class_name, "embedding": embedding}

    if not speaker_mapping:
        return

    # An existing directory gets the default file name; anything else is used as the file path.
    if os.path.isdir(output_path):
        mapping_file_path = os.path.join(output_path, "speakers.pth")
    else:
        mapping_file_path = output_path

    parent_dir = os.path.dirname(mapping_file_path)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)

    save_file(speaker_mapping, mapping_file_path)
    print("Speaker embeddings saved at:", mapping_file_path)
|
||||
|
||||
|
||||
def str2bool(value):
    """Convert a command-line token into a boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty text (including "False" and "0") becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: The raw argument string (or an actual ``bool``).

    Returns:
        bool: The parsed flag. An empty string maps to ``False``, matching what
        ``bool("")`` produced before.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("", "false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
        """
Example runs:
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json

python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--model_path",
        type=str,
        help="Path to model checkpoint file. It defaults to the released speaker encoder.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to model config file. It defaults to the released speaker encoder config.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
    )
    parser.add_argument(
        "--config_dataset_path",
        type=str,
        help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
        default=None,
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Path for output `pth` or `json` file.",
        default="speakers.pth",
    )
    parser.add_argument(
        "--old_file",
        type=str,
        help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
        default=None,
    )
    parser.add_argument(
        "--old_append",
        help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
        default=False,
        action="store_true",
    )
    # str2bool instead of bool: bool("False") is True, so "--disable_cuda False" used to disable CUDA.
    parser.add_argument("--disable_cuda", type=str2bool, help="Flag to disable cuda.", default=False)
    parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
    parser.add_argument(
        "--formatter_name",
        type=str,
        help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        help="Path to the dataset. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_train",
        type=str,
        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_val",
        type=str,
        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    args = parser.parse_args()

    compute_embeddings(
        args.model_path,
        args.config_path,
        args.output_path,
        old_speakers_file=args.old_file,
        old_append=args.old_append,
        config_dataset_path=args.config_dataset_path,
        formatter_name=args.formatter_name,
        dataset_name=args.dataset_name,
        dataset_path=args.dataset_path,
        meta_file_train=args.meta_file_train,
        meta_file_val=args.meta_file_val,
        disable_cuda=args.disable_cuda,
        no_eval=args.no_eval,
    )
|
||||
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
# from TTS.utils.io import load_config
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
|
||||
def main():
    """Compute per-bin mean and std of mel and linear spectrograms and save them.

    The audio files come from ``--data_path`` (searched recursively for
    ``*.wav``) when it is given, otherwise from the train split of the datasets
    in the config. The statistics and the audio config used to compute them are
    stored with ``np.save`` at ``out_path``.

    Raises:
        RuntimeError: If no spectrogram frames were collected, so no statistics
            can be computed.
    """
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features; glob yields plain paths, dataset samples are dicts
        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats: running sums over the frame axis
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    # Without any frames the divisions below would fail with a bare ZeroDivisionError.
    if N == 0:
        raise RuntimeError(" [!] No spectrogram frames were collected; cannot compute statistics.")

    mel_mean = mel_sum / N
    # E[x^2] - E[x]^2 can come out slightly negative through rounding when a bin is
    # (nearly) constant; clamp at zero so the std is 0 rather than NaN.
    mel_scale = np.sqrt(np.maximum(mel_square_sum / N - mel_mean**2, 0))
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(np.maximum(linear_square_sum / N - linear_mean**2, 0))

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    # stats is a dict, so it is stored as a pickled object array
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,88 @@
|
||||
import argparse
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
|
||||
|
||||
def compute_encoder_accuracy(dataset_items, encoder_manager):
    """Print per-class and average classification accuracy of the encoder.

    For every item an embedding is computed from its audio file, classified via
    ``encoder_manager.encoder_criterion.softmax.inference`` and the predicted
    class id is mapped to a class name with the config's
    ``map_classid_to_classname``. The average is the unweighted mean of the
    per-class accuracies.

    Args:
        dataset_items: Iterable of samples; each is indexed with the config's
            ``class_name_key`` and with ``"audio_file"``.
        encoder_manager: Manager exposing ``encoder_config``,
            ``encoder_criterion``, ``use_cuda`` and
            ``compute_embedding_from_clip``.

    Raises:
        RuntimeError: If an item has no class name or no prediction can be made
            (missing criterion or id-to-name map), or if no item was evaluated.
    """
    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

    class_acc_dict = {}

    # compute embeddings for all wav_files
    for item in tqdm(dataset_items):
        class_name = item[class_name_key]
        wav_file = item["audio_file"]

        # extract the embedding
        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
            embedding = torch.FloatTensor(embedd).unsqueeze(0)
            if encoder_manager.use_cuda:
                embedding = embedding.cuda()

            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
            # the map is indexed with the class id as a string
            predicted_label = map_classid_to_classname[str(class_id)]
        else:
            predicted_label = None

        if class_name is None or predicted_label is None:
            raise RuntimeError("Error: class_name or/and predicted_label are None")

        # one 0/1 hit per evaluated sample, grouped by true class
        class_acc_dict.setdefault(class_name, []).append(int(class_name == predicted_label))

    # An empty dataset would otherwise end in a bare ZeroDivisionError below.
    if not class_acc_dict:
        raise RuntimeError("Error: no dataset items were evaluated")

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    print("Average Accuracy:", acc_avg / len(class_acc_dict))
|
||||
|
||||
|
||||
def str2bool(value):
    """Convert a command-line token into a boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty text (including "False" and "0") becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: The raw argument string (or an actual ``bool``).

    Returns:
        bool: The parsed flag. An empty string maps to ``False``, matching what
        ``bool("")`` produced before.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("", "false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Compute the accuracy of the encoder.\n\n"""
        """
Example runs:
python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
    parser.add_argument(
        "config_path",
        type=str,
        help="Path to model config file.",
    )

    parser.add_argument(
        "config_dataset_path",
        type=str,
        help="Path to dataset config file.",
    )
    # str2bool instead of bool: bool("False") is True, so these flags could never be switched off.
    parser.add_argument("--use_cuda", type=str2bool, help="flag to set cuda.", default=True)
    parser.add_argument("--eval", type=str2bool, help="compute eval.", default=True)

    args = parser.parse_args()

    c_dataset = load_config(args.config_dataset_path)

    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
    # NOTE(review): the eval part is presumably None when no eval split is requested
    # (compute_embeddings.py guards the same call this way) - confirm against load_tts_samples.
    items = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval

    enc_manager = SpeakerManager(
        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
    )

    compute_encoder_accuracy(items, enc_manager)
|
||||
@@ -0,0 +1,287 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract Mel spectrograms with teacher forcing."""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.datasets import TTSDataset, load_tts_samples
|
||||
from TTS.tts.models import setup_model
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.audio.numpy_transforms import quantize
|
||||
from TTS.utils.generic_utils import count_parameters
|
||||
|
||||
# Module-wide switch: True when PyTorch reports CUDA as available. It is read by
# format_data() (moving batches to the GPU) and main() (moving the model to the GPU).
use_cuda = torch.cuda.is_available()
|
||||
|
||||
|
||||
def setup_loader(ap, r, verbose=False):
    """Build the ``DataLoader`` that feeds the extraction loop.

    Relies on the module-level ``c`` (config), ``meta_data`` (samples) and
    ``speaker_manager`` being set before the call.

    Args:
        ap: Audio processor handed to the dataset.
        r: Value passed to the dataset as ``outputs_per_step``.
        verbose: Forwarded to the dataset.

    Returns:
        DataLoader: Unshuffled loader over all samples using the dataset's own
        ``collate_fn``.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)

    # Speaker information is only forwarded for the speaker mode enabled in the config.
    speaker_id_mapping = speaker_manager.name_to_id if c.use_speaker_embedding else None
    d_vector_mapping = speaker_manager.embeddings if c.use_d_vector_file else None

    dataset = TTSDataset(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
        speaker_id_mapping=speaker_id_mapping,
        d_vector_mapping=d_vector_mapping,
    )

    # NOTE(review): the reviewed copy of this file had lost its indentation; it is
    # assumed that only compute_input_seq() is conditional and preprocess_samples()
    # always runs - confirm against the repository.
    if c.use_phonemes and c.compute_input_seq_cache:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
|
||||
|
||||
|
||||
def set_filename(wav_path, out_path):
    """Derive the output paths for one wav file and create the output folders.

    Creates the ``quant``, ``mel``, ``wav_gl`` and ``wav`` sub-directories of
    ``out_path`` if they are missing.

    Args:
        wav_path: Path of the source wav file; only its base name is used.
        out_path: Root directory for all outputs.

    Returns:
        tuple: ``(file_name, wavq_path, mel_path, wav_gl_path, wav_path)``, where
        ``file_name`` is the base name without its extension. ``wavq_path`` and
        ``mel_path`` carry no extension; the two wav paths end in ``.wav``.
    """
    wav_file = os.path.basename(wav_path)
    # splitext drops only the final extension; split(".")[0] cut "a.b.wav" down
    # to "a", so differently named files could overwrite each other's outputs.
    file_name = os.path.splitext(wav_file)[0]

    for sub_dir in ("quant", "mel", "wav_gl", "wav"):
        os.makedirs(os.path.join(out_path, sub_dir), exist_ok=True)

    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
|
||||
|
||||
|
||||
def format_data(data):
    """Unpack a collated batch and move its tensors to the GPU when CUDA is on.

    Args:
        data: Batch mapping with the keys ``token_id``, ``token_id_lengths``,
            ``mel``, ``mel_lengths``, ``item_idxs``, ``d_vectors``,
            ``speaker_ids`` and ``attns``.

    Returns:
        tuple: ``(text_input, text_lengths, mel_input, mel_lengths, speaker_ids,
        d_vectors, avg_text_length, avg_spec_length, attn_mask, item_idx)``.
    """
    # setup input data
    text_input, text_lengths = data["token_id"], data["token_id_lengths"]
    mel_input, mel_lengths = data["mel"], data["mel_lengths"]
    item_idx = data["item_idxs"]
    d_vectors, speaker_ids, attn_mask = data["d_vectors"], data["speaker_ids"], data["attns"]

    # the averages are taken before the tensors are moved to the GPU
    avg_text_length = torch.mean(text_lengths.float())
    avg_spec_length = torch.mean(mel_lengths.float())

    # dispatch data to GPU
    if use_cuda:
        text_input, text_lengths, mel_input, mel_lengths = (
            tensor.cuda(non_blocking=True) for tensor in (text_input, text_lengths, mel_input, mel_lengths)
        )
        # these three may be None and are then passed through untouched
        speaker_ids, d_vectors, attn_mask = (
            None if tensor is None else tensor.cuda(non_blocking=True)
            for tensor in (speaker_ids, d_vectors, attn_mask)
        )

    return (
        text_input,
        text_lengths,
        mel_input,
        mel_lengths,
        speaker_ids,
        d_vectors,
        avg_text_length,
        avg_spec_length,
        attn_mask,
        item_idx,
    )
|
||||
|
||||
|
||||
@torch.no_grad()
def inference(
    model_name,
    model,
    ap,
    text_input,
    text_lengths,
    mel_input,
    mel_lengths,
    speaker_ids=None,
    d_vectors=None,
):
    """Run a teacher-forced pass and return the model's spectrograms as numpy.

    Args:
        model_name: ``"glow_tts"``, ``"tacotron"`` or ``"tacotron2"``.
        model: Model to run. GlowTTS is called through ``inference_with_MAS``,
            the Tacotron models through their forward call.
        ap: Audio processor; only used for ``"tacotron"``, whose outputs are
            converted with ``ap.out_linear_to_mel``.
        text_input: Batch of input token ids.
        text_lengths: Lengths of the text inputs.
        mel_input: Batch of target mel spectrograms.
        mel_lengths: Lengths of the mel spectrograms.
        speaker_ids: Optional speaker ids.
        d_vectors: Optional d-vectors.

    Returns:
        numpy.ndarray: The batch of model outputs.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names. This
            used to surface as an ``UnboundLocalError`` at the ``return``, for
            other names containing "tacotron" only after a wasted forward pass.
    """
    if model_name not in ("glow_tts", "tacotron", "tacotron2"):
        raise ValueError(f" [!] Unsupported model for spectrogram extraction: {model_name}")

    if model_name == "glow_tts":
        # speaker ids take precedence over d-vectors as the conditioning input
        speaker_c = speaker_ids if speaker_ids is not None else d_vectors
        outputs = model.inference_with_MAS(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
        )
        return outputs["model_outputs"].detach().cpu().numpy()

    aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
    outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
    postnet_outputs = outputs["model_outputs"]

    if model_name == "tacotron2":
        return postnet_outputs.detach().cpu().numpy()

    # "tacotron": convert every item with ap.out_linear_to_mel; the array is
    # transposed going in and coming out of that call
    postnet_outputs = postnet_outputs.data.cpu().numpy()
    mel_specs = [
        torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T) for postnet_output in postnet_outputs
    ]
    return torch.stack(mel_specs).cpu().numpy()
|
||||
|
||||
|
||||
def extract_spectrograms(
    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
    """Run the model over every batch and write its mel spectrograms to disk.

    For each item the model output, trimmed to the item's mel length and
    transposed, is saved under ``output_path/mel``. When the loop is done, a
    metadata file with one ``wav_path|mel_path.npy`` line per item is written
    into ``output_path``.

    Args:
        data_loader: Loader yielding batches accepted by ``format_data``.
        model: Model to run; it is switched to eval mode first.
        ap: Audio processor used to load and save audio.
        output_path: Root directory for all outputs.
        quantize_bits: When greater than zero, also save the quantized waveform.
        save_audio: Also save the loaded waveform as a wav file.
        debug: Also save a waveform obtained with ``ap.inv_melspectrogram`` from
            the saved mel.
        metada_name: File name of the metadata file.
    """
    model.eval()
    exported = []
    for batch in tqdm(data_loader, total=len(data_loader)):
        # format data: the averages and the attention mask are not needed here
        (
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
            _,
            _,
            _,
            item_idx,
        ) = format_data(batch)

        model_output = inference(
            c.model.lower(),
            model,
            ap,
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
        )

        batch_size = text_input.shape[0]
        for idx in range(batch_size):
            wav_file_path = item_idx[idx]
            wav = ap.load_wav(wav_file_path)
            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

            # quantize and save wav
            if quantize_bits > 0:
                np.save(wavq_path, quantize(wav, quantize_bits))

            # save TTS mel, cut to this item's length and transposed
            mel = model_output[idx][: mel_lengths[idx], :].T
            np.save(mel_path, mel)

            exported.append([wav_file_path, mel_path])
            if save_audio:
                ap.save_wav(wav, wav_path)

            if debug:
                print("Audio for debug saved at:", wav_gl_path)
                wav = ap.inv_melspectrogram(mel)
                ap.save_wav(wav, wav_gl_path)

    metadata_path = os.path.join(output_path, metada_name)
    with open(metadata_path, "w", encoding="utf-8") as f:
        for source_wav, saved_mel in exported:
            # mel_path is stored without extension, so ".npy" is appended for the listing
            f.write(f"{source_wav}|{saved_mel+'.npy'}\n")
|
||||
|
||||
|
||||
def str2bool(value):
    """Convert a command-line token into a boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty text (including "False" and "0") becomes ``True``. This converter
    interprets the text instead.

    Args:
        value: The raw argument string (or an actual ``bool``).

    Returns:
        bool: The parsed flag. An empty string maps to ``False``, matching what
        ``bool("")`` produced before.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("", "false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


def main(args):  # pylint: disable=redefined-outer-name
    """Load the data and the model, then extract spectrograms for every sample.

    Reads the module-level config ``c`` and publishes ``meta_data`` and
    ``speaker_manager`` as module globals, because ``setup_loader`` picks them
    up from there.

    Args:
        args: Parsed command-line arguments (``checkpoint_path``, ``output_path``,
            ``eval``, ``quantize_bits``, ``save_audio``, ``debug``).
    """
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )

    # use eval and training partitions
    # NOTE(review): the eval part is presumably None when no eval split is requested
    # (compute_embeddings.py guards the same call this way) - confirm against load_tts_samples.
    meta_data = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print(f"\n > Model has {num_params} parameters", flush=True)
    # set r: fixed to 1 for glow_tts, otherwise taken from the model's decoder
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantize_bits=args.quantize_bits,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
    # str2bool instead of bool: bool("False") is True, so "--eval False" was silently ignored.
    parser.add_argument("--eval", type=str2bool, help="compute eval.", default=True)
    args = parser.parse_args()

    c = load_config(args.config_path)
    # silence trimming is switched off for extraction
    c.audio.trim_silence = False
    main(args)
|
||||
@@ -0,0 +1,45 @@
|
||||
"""Find all the unique characters in a dataset"""
|
||||
import argparse
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
|
||||
|
||||
def main():
    """Print the unique characters used by the datasets of a config file.

    Reads ``--config_path`` from the command line, loads the train and eval
    samples of the configured datasets and prints, for their ``text`` fields:
    the number of distinct characters, the distinct characters, the lowercase
    ones among them, and the distinct characters after lowercasing everything.
    """
    # pylint: disable=bad-option-value
    cli = argparse.ArgumentParser(
        description="Find all the unique characters or phonemes in a dataset.\n\n"
        "\nExample runs:\n\npython TTS/bin/find_unique_chars.py --config_path config.json\n",
        formatter_class=RawTextHelpFormatter,
    )
    cli.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = cli.parse_args()

    config = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # Concatenate every transcript, then reduce to the distinct characters.
    corpus = "".join(sample["text"] for sample in train_items + eval_items)
    unique_chars = set(corpus)
    lowercase_only = [ch for ch in unique_chars if ch.islower()]
    lowered = {ch.lower() for ch in unique_chars}

    print(f" > Number of unique characters: {len(unique_chars)}")
    print(f" > Unique characters: {''.join(sorted(unique_chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lowercase_only))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(lowered))}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Find all the unique characters in a dataset"""
|
||||
import argparse
|
||||
import multiprocessing
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
from tqdm.contrib.concurrent import process_map
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.text.phonemizers import Gruut
|
||||
|
||||
|
||||
def compute_phonemes(item):
    """Return the set of symbols in the phonemized text of one sample.

    Phonemizes ``item["text"]`` with the module-level ``phonemizer`` (assigned
    in ``main``) and removes every "|" from the result before collecting the
    distinct characters.
    """
    phonemized = phonemizer.phonemize(item["text"])
    return set(phonemized.replace("|", ""))
|
||||
|
||||
|
||||
def main():
    """Print the unique phonemes produced for the datasets of a config file.

    Reads ``--config_path``, loads the train and eval samples, phonemizes every
    sample in parallel with a Gruut phonemizer and prints the distinct phoneme
    symbols, the lowercase ones, and the distinct symbols after lowercasing.

    Raises:
        ValueError: if the config has no ``phoneme_language``, if any sample has
            no language, or if the samples do not all share one language.
    """
    # pylint: disable=W0601
    global c, phonemizer
    # pylint: disable=bad-option-value
    cli = argparse.ArgumentParser(
        description="Find all the unique characters or phonemes in a dataset.\n\n"
        "\nExample runs:\n\npython TTS/bin/find_unique_phonemes.py --config_path config.json\n",
        formatter_class=RawTextHelpFormatter,
    )
    cli.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = cli.parse_args()

    c = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    languages = [sample["language"] for sample in items]

    if not (c.phoneme_language and all(languages)):
        raise ValueError("Phoneme language must be defined in config.")

    # Only a single language per run is supported, so every entry must equal the first one.
    primary_language = languages[0]
    if any(language != primary_language for language in languages):
        raise ValueError(
            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
        )

    # Stored globally because compute_phonemes() reads it by name.
    phonemizer = Gruut(language=primary_language, keep_puncs=True)

    per_item_phonemes = process_map(
        compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15
    )

    # Merge the per-sample sets into one set of distinct symbols.
    phones = set()
    for symbols in per_item_phonemes:
        phones.update(symbols)

    lowercase_only = [symbol for symbol in phones if symbol.islower()]
    lowered = {symbol.lower() for symbol in phones}

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lowercase_only))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(lowered))}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,124 @@
|
||||
import argparse
|
||||
import glob
|
||||
import multiprocessing
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
|
||||
|
||||
# Limit PyTorch to a single CPU thread for intra-op parallelism.
# NOTE(review): presumably so the worker processes started with --num_processes do not oversubscribe the CPU - confirm.
torch.set_num_threads(1)
|
||||
|
||||
|
||||
def adjust_path_and_remove_silence(audio_path):
    """Remove silence from one audio file and write it under the output dir.

    The output path is ``audio_path`` with the ``args.input_dir`` prefix
    replaced by ``args.output_dir``. Reads the module-level ``args`` and
    ``model_and_utils``.

    Args:
        audio_path: path of the input audio file.

    Returns:
        ``(output_path, is_speech)``. When the output already exists and
        ``args.force`` is not set, nothing is processed and ``is_speech`` is
        ``False``.
    """
    # Joining with "" makes both roots end in a path separator before the swap.
    source_root = os.path.join(args.input_dir, "")
    target_root = os.path.join(args.output_dir, "")
    output_path = audio_path.replace(source_root, target_root)

    # ignore if the file exists
    # NOTE(review): the False returned here makes preprocess_audios() list skipped
    # files in filtered_files.txt alongside non-speech files - confirm this is intended.
    if not args.force and os.path.exists(output_path):
        return output_path, False

    # create all directory structure
    destination_dir = pathlib.Path(output_path).parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    # remove the silence and save the audio
    output_path, is_speech = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )
    return output_path, is_speech
|
||||
|
||||
|
||||
def preprocess_audios():
    """Remove silence from every file matched by ``args.glob``.

    Collects the files under ``args.input_dir``, processes them with
    ``adjust_path_and_remove_silence`` (in a process pool when
    ``args.num_processes > 1``, sequentially otherwise) and writes the output
    paths reported as non-speech to ``filtered_files.txt`` in
    ``args.output_dir``. Reads the module-level ``args``.
    """
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output idrectory.")

    print(
        "> Trimming just the beginning and the end with nonspeech parts."
        if args.trim_just_beginning_and_end
        else "> Trimming all nonspeech parts."
    )

    if not files:
        print("> No files Found !")
        return

    if args.num_processes > 1:
        with multiprocessing.Pool(processes=args.num_processes) as pool:
            # list() drains the lazy iterator while the pool is still alive.
            results = list(
                tqdm(
                    pool.imap_unordered(adjust_path_and_remove_silence, files),
                    total=len(files),
                    desc="Processing audio files",
                )
            )
    else:
        results = [adjust_path_and_remove_silence(path) for path in tqdm(files)]

    filtered_files = [output_path for output_path, is_speech in results if not is_speech]

    # write files that do not have speech
    report_path = os.path.join(args.output_dir, "filtered_files.txt")
    with open(report_path, "w", encoding="utf-8") as report:
        report.writelines(str(path) + "\n" for path in filtered_files)
|
||||
|
||||
|
||||
if __name__ == "__main__":

    def _str2bool(value):
        """Convert a command-line token to a bool.

        ``type=bool`` turns every non-empty string, including "False", into
        ``True``, so the boolean options below could never be switched off
        with the ``--option False`` form shown in the usage example.

        Args:
            value: the raw token given on the command line (or an actual bool).

        Returns:
            The boolean the token stands for.

        Raises:
            argparse.ArgumentTypeError: if the token is not a recognised spelling.
        """
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ("yes", "true", "t", "y", "1"):
            return True
        # The empty string stays falsy, exactly as it was with ``type=bool``.
        if token in ("no", "false", "f", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}.")

    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=_str2bool,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=_str2bool,
        default=False,
        help="If True use cuda",
    )
    parser.add_argument(
        "--use_onnx",
        type=_str2bool,
        default=False,
        help="If True use onnx",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of processes to use",
    )
    # ``args`` and ``model_and_utils`` are module-level names because the
    # functions above read them by name.
    args = parser.parse_args()

    # Without an explicit output dir the files are written back into the input dir.
    if args.output_dir == "":
        args.output_dir = args.input_dir

    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
    preprocess_audios()
|
||||
@@ -0,0 +1,90 @@
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
from argparse import RawTextHelpFormatter
|
||||
from multiprocessing import Pool
|
||||
from shutil import copytree
|
||||
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def resample_file(func_args):
    """Resample a single audio file and overwrite it.

    Args:
        func_args: a ``(filename, output_sr)`` pair, packed into one argument
            so the function can be mapped over a process pool.
    """
    audio_path, target_sr = func_args
    samples, loaded_sr = librosa.load(audio_path, sr=target_sr)
    # Written back to the same path that was read.
    sf.write(audio_path, samples, loaded_sr)
|
||||
|
||||
|
||||
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
    """Resample every ``file_ext`` file found recursively under a folder.

    Args:
        input_dir: folder containing the audio files.
        output_sr: sample rate passed on to ``resample_file`` for every file.
        output_dir: when given, ``input_dir`` is first copied there and the
            copy is resampled; otherwise the files are resampled in place.
        file_ext: extension (without dot) of the files to process.
        n_jobs: number of worker processes handed to ``Pool``.
    """
    if output_dir:
        print("Recursively copying the input folder...")
        copytree(input_dir, output_dir)
        # From here on the copy is the folder being modified.
        input_dir = output_dir

    print("Resampling the audio files...")
    pattern = os.path.join(input_dir, f"**/*.{file_ext}")
    found = glob.glob(pattern, recursive=True)
    print(f"Found {len(found)} files...")

    # One (path, sample rate) job per file, the shape resample_file() unpacks.
    jobs = [(path, output_sr) for path in found]
    with Pool(processes=n_jobs) as pool, tqdm(total=len(jobs)) as progress:
        for _ in pool.imap_unordered(resample_file, jobs):
            progress.update()

    print("Done !")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the options and resample the requested folder.
    parser = argparse.ArgumentParser(
        description=(
            "Resample a folder recusively with librosa\n"
            "Can be used in place or create a copy of the folder as an output.\n\n\n"
            "Example run:\n"
            "python TTS/bin/resample.py\n"
            "--input_dir /root/LJSpeech-1.1/\n"
            "--output_sr 22050\n"
            "--output_dir /root/resampled_LJSpeech-1.1/\n"
            "--file_ext wav\n"
            "--n_jobs 24\n"
        ),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--input_dir",
        required=True,
        default=None,
        type=str,
        help="Path of the folder containing the audio files to resample",
    )
    parser.add_argument(
        "--output_sr",
        default=22050,
        type=int,
        help="Samlple rate to which the audio files should be resampled",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="Path of the destination folder. If not defined, the operation is done in place",
    )
    parser.add_argument(
        "--file_ext",
        default="wav",
        type=str,
        help="Extension of the audio files to resample",
    )
    parser.add_argument(
        "--n_jobs", default=None, type=int, help="Number of threads to use, by default it uses all cores"
    )

    args = parser.parse_args()

    resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
|
||||
@@ -0,0 +1,494 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import sys
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
# pylint: disable=redefined-outer-name, unused-argument
|
||||
from pathlib import Path
|
||||
|
||||
description = """
|
||||
Synthesize speech on command line.
|
||||
|
||||
You can either use your trained model or choose a model from the provided list.
|
||||
|
||||
If you don't specify any models, then it uses LJSpeech based English model.
|
||||
|
||||
#### Single Speaker Models
|
||||
|
||||
- List provided models:
|
||||
|
||||
```
|
||||
$ tts --list_models
|
||||
```
|
||||
|
||||
- Get model info (for both tts_models and vocoder_models):
|
||||
|
||||
- Query by type/name:
|
||||
The model_info_by_name uses the name as it from the --list_models.
|
||||
```
|
||||
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
||||
```
|
||||
For example:
|
||||
```
|
||||
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
|
||||
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
|
||||
```
|
||||
- Query by type/idx:
|
||||
The model_query_idx uses the corresponding idx from --list_models.
|
||||
|
||||
```
|
||||
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
|
||||
```
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
$ tts --model_info_by_idx tts_models/3
|
||||
```
|
||||
|
||||
- Query info for model info by full name:
|
||||
```
|
||||
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
||||
```
|
||||
|
||||
- Run TTS with default models:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --out_path output/path/speech.wav
|
||||
```
|
||||
|
||||
- Run TTS and pipe out the generated TTS wav file data:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
|
||||
```
|
||||
|
||||
- Run a TTS model with its default vocoder model:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
|
||||
```
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
|
||||
```
|
||||
|
||||
- Run with specific TTS and vocoder models from the list:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
|
||||
```
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
|
||||
```
|
||||
|
||||
- Run your own TTS model (Using Griffin-Lim Vocoder):
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
|
||||
```
|
||||
|
||||
- Run your own TTS and Vocoder models:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
|
||||
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
|
||||
```
|
||||
|
||||
#### Multi-speaker Models
|
||||
|
||||
- List the available speakers and choose a <speaker_id> among them:
|
||||
|
||||
```
|
||||
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
|
||||
```
|
||||
|
||||
- Run the multi-speaker TTS model with the target speaker ID:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
|
||||
```
|
||||
|
||||
- Run your own multi-speaker TTS model:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
|
||||
```
|
||||
|
||||
### Voice Conversion Models
|
||||
|
||||
```
|
||||
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
def str2bool(v):
    """Parse a command-line token into a bool (argparse ``type=`` helper).

    Args:
        v: a bool, returned unchanged, or a string compared case-insensitively
            against the accepted spellings.

    Returns:
        ``True`` for "yes", "true", "t", "y", "1"; ``False`` for "no", "false",
        "f", "n", "0".

    Raises:
        argparse.ArgumentTypeError: for any other string.
    """
    if isinstance(v, bool):
        return v

    token = v.lower()
    spellings = (
        (True, ("yes", "true", "t", "y", "1")),
        (False, ("no", "false", "f", "n", "0")),
    )
    for outcome, accepted in spellings:
        if token in accepted:
            return outcome
    raise argparse.ArgumentTypeError("Boolean value expected.")
|
||||
|
||||
|
||||
def main():
    """Run the ``tts`` command line interface.

    Parses the command line and then does one of the following:

    - prints model information (``--list_models``, ``--model_info_by_idx``,
      ``--model_info_by_name``) and exits;
    - prints the speaker or language ids of the loaded model
      (``--list_speaker_idxs``, ``--list_language_idxs``) and returns;
    - runs synthesis or voice conversion and saves the audio to ``--out_path``
      (also to stdout when ``--pipe_out`` is set).

    The help text is printed when none of the action-selecting options is given.

    Raises:
        ValueError: if neither a TTS model, a voice conversion model nor a
            model directory could be resolved from the arguments.
    """
    parser = argparse.ArgumentParser(
        description=description.replace(" ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    # str2bool instead of bool: with type=bool any non-empty value, even "false",
    # enabled the flag. nargs/const also allow the bare ``--use_cuda`` form.
    parser.add_argument(
        "--use_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        help="Run model on CUDA.",
        default=False,
    )
    parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
    parser.add_argument(
        "--pipe_out",
        help="stdout the generated TTS wav file for shell pipe.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    # Same type=bool defect as --use_cuda, fixed the same way.
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,
        nargs="?",
        const=True,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    parser.add_argument(
        "--progress_bar",
        type=str2bool,
        help="If true shows a progress bar for the model download. Defaults to True",
        default=True,
    )

    # voice conversion args
    parser.add_argument(
        "--source_wav",
        type=str,
        default=None,
        help="Original audio file to convert in the voice of the target_wav",
    )
    parser.add_argument(
        "--target_wav",
        type=str,
        default=None,
        help="Target audio file to convert in the voice of the source_wav",
    )

    parser.add_argument(
        "--voice_dir",
        type=str,
        default=None,
        help="Voice dir for tortoise model",
    )

    args = parser.parse_args()

    # print the description if none of the action-selecting options is set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
        args.source_wav,
        args.target_wav,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    pipe_out = sys.stdout if args.pipe_out else None

    # With --pipe_out, stdout carries the wav data, so regular prints are silenced.
    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
        # Late-import to make things load faster
        from TTS.api import TTS
        from TTS.utils.manage import ModelManager
        from TTS.utils.synthesizer import Synthesizer

        # load model manager
        path = Path(__file__).parent / "../.models.json"
        manager = ModelManager(path, progress_bar=args.progress_bar)
        # NOTE(review): ``api`` is never read below; the call is kept in case
        # constructing TTS() has side effects - confirm and remove if not.
        api = TTS()

        tts_path = None
        tts_config_path = None
        speakers_file_path = None
        language_ids_file_path = None
        vocoder_path = None
        vocoder_config_path = None
        encoder_path = None
        encoder_config_path = None
        vc_path = None
        vc_config_path = None
        model_dir = None

        # CASE1 #list : list pre-trained TTS models
        if args.list_models:
            manager.list_models()
            sys.exit()

        # CASE2 #info : model info for pre-trained TTS models
        if args.model_info_by_idx:
            model_query = args.model_info_by_idx
            manager.model_info_by_idx(model_query)
            sys.exit()

        if args.model_info_by_name:
            model_query_full_name = args.model_info_by_name
            manager.model_info_by_full_name(model_query_full_name)
            sys.exit()

        # CASE3: load pre-trained model paths
        if args.model_name is not None and not args.model_path:
            model_path, config_path, model_item = manager.download_model(args.model_name)
            # tts model
            if model_item["model_type"] == "tts_models":
                tts_path = model_path
                tts_config_path = config_path
                if "default_vocoder" in model_item:
                    # An explicitly requested vocoder wins over the model's default.
                    args.vocoder_name = (
                        model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
                    )

            # voice conversion model
            if model_item["model_type"] == "voice_conversion_models":
                vc_path = model_path
                vc_config_path = config_path

            # tts model with multiple files to be loaded from the directory path
            if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
                model_dir = model_path
                tts_path = None
                tts_config_path = None
                args.vocoder_name = None

        # load vocoder
        if args.vocoder_name is not None and not args.vocoder_path:
            vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

        # CASE4: set custom model paths
        if args.model_path is not None:
            tts_path = args.model_path
            tts_config_path = args.config_path
            speakers_file_path = args.speakers_file_path
            language_ids_file_path = args.language_ids_file_path

        if args.vocoder_path is not None:
            vocoder_path = args.vocoder_path
            vocoder_config_path = args.vocoder_config_path

        if args.encoder_path is not None:
            encoder_path = args.encoder_path
            encoder_config_path = args.encoder_config_path

        # --use_cuda overrides whatever --device says.
        device = args.device
        if args.use_cuda:
            device = "cuda"

        # load models
        synthesizer = Synthesizer(
            tts_path,
            tts_config_path,
            speakers_file_path,
            language_ids_file_path,
            vocoder_path,
            vocoder_config_path,
            encoder_path,
            encoder_config_path,
            vc_path,
            vc_config_path,
            model_dir,
            args.voice_dir,
        ).to(device)

        # query speaker ids of a multi-speaker model.
        if args.list_speaker_idxs:
            print(
                " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
            )
            print(synthesizer.tts_model.speaker_manager.name_to_id)
            return

        # query language ids of a multi-lingual model.
        if args.list_language_idxs:
            print(
                " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
            )
            print(synthesizer.tts_model.language_manager.name_to_id)
            return

        # check the arguments against a multi-speaker model.
        if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
            print(
                " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
                "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
            )
            return

        # RUN THE SYNTHESIS
        if args.text:
            print(" > Text: {}".format(args.text))

        # kick it
        if tts_path is not None:
            wav = synthesizer.tts(
                args.text,
                speaker_name=args.speaker_idx,
                language_name=args.language_idx,
                speaker_wav=args.speaker_wav,
                reference_wav=args.reference_wav,
                style_wav=args.capacitron_style_wav,
                style_text=args.capacitron_style_text,
                reference_speaker_name=args.reference_speaker_idx,
            )
        elif vc_path is not None:
            wav = synthesizer.voice_conversion(
                source_wav=args.source_wav,
                target_wav=args.target_wav,
            )
        elif model_dir is not None:
            wav = synthesizer.tts(
                args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
            )
        else:
            # Previously this fell through and failed with an unbound ``wav`` below.
            raise ValueError(
                f"No TTS model, voice conversion model or model directory could be resolved for '{args.model_name}'."
            )

        # save the results
        print(" > Saving output to {}".format(args.out_path))
        synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,332 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from trainer.io import copy_model_files, save_best_model, save_checkpoint
|
||||
from trainer.torch import NoamLR
|
||||
from trainer.trainer_utils import get_optimizer
|
||||
|
||||
from TTS.encoder.dataset import EncoderDataset
|
||||
from TTS.encoder.utils.generic_utils import setup_encoder_model
|
||||
from TTS.encoder.utils.training import init_training
|
||||
from TTS.encoder.utils.visual import plot_embeddings
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
|
||||
from TTS.utils.samplers import PerfectBatchSampler
|
||||
from TTS.utils.training import check_update
|
||||
|
||||
# cuDNN configuration: enable the backend and its benchmark mode.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

# Fixed seed for PyTorch's random number generator.
torch.manual_seed(54321)

# Device information, read by the functions below through these module-level names.
num_gpus = torch.cuda.device_count()
use_cuda = torch.cuda.is_available()
print(" > Using CUDA: ", use_cuda)
print(" > Number of GPUs: ", num_gpus)
|
||||
|
||||
|
||||
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
    """Build the ``DataLoader`` for encoder training or evaluation.

    Reads the module-level config ``c`` and the sample lists
    ``meta_data_train`` / ``meta_data_eval``; in eval mode it also reads
    ``train_classes``.

    Args:
        ap: audio processor handed to the dataset.
        is_val: use the eval samples and the ``eval_*`` batch settings, disable
            augmentation and shuffling.
        verbose: passed through to the dataset.

    Returns:
        ``(loader, classes, class_id_to_name)`` where ``classes`` is the
        dataset's class list and the last item is
        ``dataset.get_map_classid_to_classname()``.

    Raises:
        RuntimeError: if the dataset has fewer classes than the configured
            number of classes per batch.
    """
    if is_val:
        num_utter_per_class = c.eval_num_utter_per_class
        num_classes_in_batch = c.eval_num_classes_in_batch
    else:
        num_utter_per_class = c.num_utter_per_class
        num_classes_in_batch = c.num_classes_in_batch

    dataset = EncoderDataset(
        c,
        ap,
        meta_data_eval if is_val else meta_data_train,
        voice_len=c.voice_len,
        num_utter_per_class=num_utter_per_class,
        num_classes_in_batch=num_classes_in_batch,
        verbose=verbose,
        augmentation_config=None if is_val else c.audio_augmentation,
        use_torch_spec=c.model_params.get("use_torch_spec", False),
    )
    # get classes list
    classes = dataset.get_class_list()

    total_batch_size = num_classes_in_batch * num_utter_per_class
    sampler = PerfectBatchSampler(
        dataset.items,
        classes,
        batch_size=total_batch_size,
        num_classes_in_batch=num_classes_in_batch,
        num_gpus=1,
        shuffle=not is_val,
        drop_last=True,
    )

    if len(classes) < num_classes_in_batch:
        option_name = "eval_num_classes_in_batch" if is_val else "num_classes_in_batch"
        split_name = "Eval" if is_val else "Train"
        raise RuntimeError(
            f"config.{option_name} ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the {split_name} dataset) !"
        )

    # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal
    if is_val:
        dataset.set_classes(train_classes)

    loader = DataLoader(
        dataset,
        num_workers=c.num_loader_workers,
        batch_sampler=sampler,
        collate_fn=dataset.collate_fn,
    )

    return loader, classes, dataset.get_map_classid_to_classname()
|
||||
|
||||
|
||||
def evaluation(model, criterion, data_loader, global_step):
    """Compute the average evaluation loss and log it together with a UMAP plot.

    Reads the module-level ``c``, ``use_cuda`` and ``dashboard_logger``.

    Args:
        model: Encoder model called on each input batch.
        criterion: Loss callable applied to the reshaped outputs and the labels.
        data_loader: Evaluation loader yielding ``(inputs, labels)`` batches.
        global_step: Step index under which stats and figures are logged.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    eval_loss = 0
    for data in data_loader:
        with torch.no_grad():
            # setup input data
            inputs, labels = data

            # group samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
            labels = torch.transpose(
                labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
            ).reshape(labels.shape)
            inputs = torch.transpose(
                inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
            ).reshape(inputs.shape)

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(
                outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
            )

            eval_loss += loss.item()

    eval_avg_loss = eval_loss / len(data_loader)
    # save stats
    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
    # plot the last batch in the evaluation; eval batches are laid out with
    # c.eval_num_classes_in_batch classes, so the plot must use the same count
    figures = {
        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.eval_num_classes_in_batch),
    }
    dashboard_logger.eval_figures(global_step, figures)
    return eval_avg_loss
|
||||
|
||||
|
||||
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    """Run the encoder training loop for ``c.epochs`` epochs.

    Reads the module-level ``c``, ``use_cuda``, ``dashboard_logger`` and
    ``OUT_PATH``. After every epoch, when ``c.run_eval`` is set, the model is
    evaluated and ``save_best_model`` decides whether to keep a checkpoint.

    Args:
        model: Encoder model to optimise.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler, stepped only when ``c.lr_decay`` is set.
        criterion: Loss callable; its ``state_dict()`` is stored in checkpoints.
        data_loader: Training loader yielding ``(inputs, labels)`` batches.
        eval_data_loader: Loader handed to ``evaluation``.
        global_step: Step counter to continue from.

    Returns:
        Tuple ``(best_loss, global_step)``.
    """
    model.train()
    best_loss = {"train_loss": None, "eval_loss": float("inf")}
    avg_loader_time = 0
    last_step_end = time.time()
    for epoch in range(c.epochs):
        epoch_loss_sum = 0
        epoch_elapsed = 0
        for batch in data_loader:
            step_begin = time.time()

            # unpack the batch
            inputs, labels = batch
            label_shape = labels.shape
            input_shape = inputs.shape
            # The perfect sampler interleaves the classes ([3,2,1,3,2,1]);
            # regroup so samples of one class are contiguous ([3,3,2,2,1,1]).
            labels = labels.view(c.num_utter_per_class, c.num_classes_in_batch).transpose(0, 1).reshape(label_shape)
            inputs = (
                inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1).transpose(0, 1).reshape(input_shape)
            )
            # TODO: cover the regrouping above with a unit test that compares it
            # against an explicit per-class gather of labels and inputs.

            loader_time = time.time() - last_step_end
            global_step += 1

            # learning-rate schedule
            if c.lr_decay:
                scheduler.step()
            optimizer.zero_grad()

            # move the batch to the GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass
            outputs = model(inputs)

            # loss on outputs arranged as (classes, utterances per class, features)
            utter_per_class = outputs.shape[0] // c.num_classes_in_batch
            loss = criterion(outputs.view(c.num_classes_in_batch, utter_per_class, -1), labels)
            loss.backward()
            grad_norm, _ = check_update(model, c.grad_clip)
            optimizer.step()

            step_time = time.time() - step_begin
            epoch_elapsed += step_time

            # accumulate the total epoch loss
            epoch_loss_sum += loss.item()

            # running average of the loader time, weighted by the worker count
            if c.num_loader_workers > 0:
                worker_count = c.num_loader_workers
            else:
                worker_count = 1
            if avg_loader_time != 0:
                avg_loader_time = (
                    1 / worker_count * loader_time + (worker_count - 1) / worker_count * avg_loader_time
                )
            else:
                avg_loader_time = loader_time
            lr_now = optimizer.param_groups[0]["lr"]

            if global_step % c.steps_plot_stats == 0:
                # log step statistics and a UMAP plot of the current batch
                dashboard_logger.train_epoch_stats(
                    global_step,
                    {
                        "loss": loss.item(),
                        "lr": lr_now,
                        "grad_norm": grad_norm,
                        "step_time": step_time,
                        "avg_loader_time": avg_loader_time,
                    },
                )
                umap_figure = plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch)
                dashboard_logger.train_figures(global_step, {"UMAP Plot": umap_figure})

            if global_step % c.print_step == 0:
                step_report = (
                    "   | > Step:{}  Loss:{:.5f}  GradNorm:{:.5f}  "
                    "StepTime:{:.2f}  LoaderTime:{:.2f}  AvGLoaderTime:{:.2f}  LR:{:.6f}"
                )
                print(
                    step_report.format(
                        global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, lr_now
                    ),
                    flush=True,
                )

            if global_step % c.save_step == 0:
                # periodic checkpoint
                save_checkpoint(
                    c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
                )

            last_step_end = time.time()

        print("")
        epoch_report = (
            ">>> Epoch:{}  AvgLoss: {:.5f} GradNorm:{:.5f}  "
            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} "
        )
        print(
            epoch_report.format(
                epoch, epoch_loss_sum / len(data_loader), grad_norm, epoch_elapsed, avg_loader_time
            ),
            flush=True,
        )
        # evaluation
        if c.run_eval:
            model.eval()
            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
            print("\n\n")
            print("--> EVAL PERFORMANCE")
            print(
                "   | > Epoch:{}  AvgLoss: {:.5f} ".format(epoch, eval_loss),
                flush=True,
            )
            # keep the checkpoint if it is the best one so far
            best_loss = save_best_model(
                {"train_loss": None, "eval_loss": eval_loss},
                best_loss,
                c,
                model,
                optimizer,
                None,
                global_step,
                epoch,
                OUT_PATH,
                criterion=criterion.state_dict(),
            )
            model.train()

    return best_loss, global_step
|
||||
|
||||
|
||||
def main(args):  # pylint: disable=redefined-outer-name
    """Set up data, model, optimizer and criterion, then run encoder training.

    Publishes ``meta_data_train``, ``meta_data_eval`` and ``train_classes`` as
    module-level globals because ``setup_loader`` reads them. Reads the
    module-level ``c``, ``use_cuda`` and ``OUT_PATH``.

    Args:
        args: Parsed arguments; ``restore_path`` is read and ``restore_step``
            is set by this function.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)

    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
    eval_data_loader = None
    if c.run_eval:
        eval_data_loader = setup_loader(ap, is_val=True, verbose=True)[0]

    criterion = model.get_criterion(c, len(train_classes))

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH, new_fields={})

    if not args.restore_path:
        args.restore_step = 0
    else:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)

    # the scheduler resumes from the restored step when LR decay is enabled
    scheduler = (
        NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1) if c.lr_decay else None
    )

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, args.restore_step)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # These names become module-level globals; c, OUT_PATH and dashboard_logger
    # are read directly by the functions defined above.
    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()

    try:
        main(args)
    except KeyboardInterrupt:
        # Interrupted by the user: clean up the experiment folder and exit with status 0.
        remove_experiment_folder(OUT_PATH)
        try:
            sys.exit(0)
        except SystemExit:
            # sys.exit raises SystemExit; it is caught here and the process is ended via os._exit.
            os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        # Any other failure: clean up, print the traceback and exit with status 1.
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
|
||||
@@ -0,0 +1,71 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config import load_config, register_config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models import setup_model
|
||||
|
||||
|
||||
@dataclass
class TrainTTSArgs(TrainerArgs):
    """``TrainerArgs`` extended with a ``config_path`` field for the model config file."""

    # Defaults to None; main() then falls back to continue_path or console arguments.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
|
||||
|
||||
|
||||
def main():
    """Train a `tts` model from a `config.json` file, a previous run, or console arguments."""
    # build the trainer arguments and their command-line parser
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # known arguments go to the trainer; the remainder are config overrides
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    if args.config_path or args.continue_path:
        # an explicit config file wins; otherwise reuse the config of the run being continued
        config_file = args.config_path or os.path.join(args.continue_path, "config.json")
        config = load_config(config_file)
        if config_overrides:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # no file given: build the config from console arguments alone
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load and split the training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # the model is initialised from the config and all samples
    model = setup_model(config, train_samples + eval_samples)

    # hand everything to the trainer and start
    Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    ).fit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Start training only when this file is executed as a script.
    main()
|
||||
@@ -0,0 +1,77 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config import load_config, register_config
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
||||
from TTS.vocoder.models import setup_model
|
||||
|
||||
|
||||
@dataclass
class TrainVocoderArgs(TrainerArgs):
    """``TrainerArgs`` extended with a ``config_path`` field for the model config file."""

    # Defaults to None; main() then falls back to continue_path or console arguments.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
|
||||
|
||||
|
||||
def main():
    """Train a vocoder model from a `config.json` file, a previous run, or console arguments."""
    # build the trainer arguments and their command-line parser
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # known arguments go to the trainer; the remainder are config overrides
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    if args.config_path or args.continue_path:
        # an explicit config file wins; otherwise reuse the config of the run being continued
        config_file = args.config_path or os.path.join(args.continue_path, "config.json")
        config = load_config(config_file)
        if config_overrides:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # no file given: build the config from console arguments alone
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load the samples, from pre-computed features when a feature path is configured
    use_features = "feature_path" in config and config.feature_path
    if use_features:
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
    else:
        # raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

    # audio processor shared with the trainer as a training asset
    ap = AudioProcessor(**config.audio)

    # the model is initialised from the config
    model = setup_model(config)

    # hand everything to the trainer and start
    Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    ).fit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Start training only when this file is executed as a script.
    main()
|
||||
@@ -0,0 +1,103 @@
|
||||
"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
|
||||
import argparse
|
||||
from itertools import product as cartesian_product
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
|
||||
from TTS.vocoder.models import setup_model
|
||||
|
||||
if __name__ == "__main__":
    # Command-line interface.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
    parser.add_argument("--config_path", type=str, help="Path to model config file.")
    parser.add_argument("--data_path", type=str, help="Path to data directory.")
    parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
    parser.add_argument(
        "--num_iter",
        type=int,
        help="Number of model inference iterations that you like to optimize noise schedule for.",
    )
    parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
    parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
    parser.add_argument(
        "--search_depth",
        type=int,
        default=3,
        help="Search granularity. Increasing this increases the run-time exponentially.",
    )

    # Parse the arguments and read the model config.
    args = parser.parse_args()
    config = load_config(args.config_path)

    # Audio processor built from the config's audio section.
    ap = AudioProcessor(**config.audio)

    # Keep only the first `num_samples` training clips.
    _, train_data = load_wav_data(args.data_path, 0)
    train_data = train_data[: args.num_samples]
    dataset = WaveGradDataset(
        ap=ap,
        items=train_data,
        seq_len=-1,
        hop_len=ap.hop_length,
        pad_short=config.pad_short,
        conv_pad=config.conv_pad,
        is_training=True,
        return_segments=False,
        use_noise_augment=False,
        use_cache=False,
        verbose=True,
    )
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collate_full_clips,
        drop_last=False,
        num_workers=config.num_loader_workers,
        pin_memory=False,
    )

    # Model built from the config.
    # NOTE(review): --model_path is parsed above but not referenced anywhere in
    # this script, so no checkpoint appears to be loaded here. Confirm intent.
    model = setup_model(config)
    if args.use_cuda:
        model.cuda()

    # Search space: every combination of `search_depth` random base values,
    # one per inference iteration, scaled by log-spaced exponents.
    base_values = sorted(10 * np.random.uniform(size=args.search_depth))
    print(f" > base values: {base_values}")
    exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
    best_error = float("inf")
    best_schedule = None  # pylint: disable=C0103
    total_search_iter = len(base_values) ** args.num_iter
    for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
        beta = exponents * base
        model.compute_noise_level(beta)
        for mel, audio in loader:
            # run inference, then bring the result back as a NumPy array
            if args.use_cuda:
                y_hat = model.inference(mel.cuda()).cpu().numpy()
            else:
                y_hat = model.inference(mel).numpy()

            # mel spectrogram of every generated clip, last frame dropped
            mel_hat = torch.stack([torch.from_numpy(ap.melspectrogram(clip[0])[:, :-1]) for clip in y_hat])
            mse = torch.sum((mel - mel_hat) ** 2).mean()
            error = mse.item()
            if error < best_error:
                best_error = error
                best_schedule = {"beta": beta}
                print(f" > Found a better schedule. - MSE: {error}")
                np.save(args.output_path, best_schedule)
|
||||
Reference in New Issue
Block a user