Add files via upload
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,136 @@
|
||||
import glob
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
from scipy import signal
|
||||
|
||||
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
|
||||
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
|
||||
|
||||
|
||||
class AugmentWAV(object):
    """Waveform augmentation with additive noise and room impulse responses (RIR).

    Args:
        ap: audio processor used to load noise/RIR wav files (must expose
            ``load_wav`` and ``sample_rate``).
        augmentation_config: dict with optional ``"additive"`` and ``"rir"``
            sections describing the noise sources.
    """

    def __init__(self, ap, augmentation_config):
        self.ap = ap
        self.use_additive_noise = False

        if "additive" in augmentation_config.keys():
            self.additive_noise_config = augmentation_config["additive"]
            additive_path = self.additive_noise_config["sounds_path"]
            if additive_path:
                self.use_additive_noise = True
                # get noise types: every dict-valued entry of the additive
                # config names a noise category (e.g. "noise", "music").
                self.additive_noise_types = []
                for key in self.additive_noise_config.keys():
                    if isinstance(self.additive_noise_config[key], dict):
                        self.additive_noise_types.append(key)

                additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)

                self.noise_list = {}

                for wav_file in additive_files:
                    # the first path component under sounds_path names the noise type
                    noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
                    # ignore not listed directories
                    if noise_dir not in self.additive_noise_types:
                        continue
                    if noise_dir not in self.noise_list:
                        self.noise_list[noise_dir] = []
                    self.noise_list[noise_dir].append(wav_file)

                print(
                    f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
                )

        self.use_rir = False

        if "rir" in augmentation_config.keys():
            self.rir_config = augmentation_config["rir"]
            if self.rir_config["rir_path"]:
                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
                self.use_rir = True

                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")

        self.create_augmentation_global_list()

    def create_augmentation_global_list(self):
        """Build the list of augmentation choices sampled by ``apply_one``."""
        if self.use_additive_noise:
            self.global_noise_list = self.additive_noise_types
        else:
            self.global_noise_list = []
        if self.use_rir:
            # sentinel value that routes apply_one() to reverberate()
            self.global_noise_list.append("RIR_AUG")

    def additive_noise(self, noise_type, audio):
        """Mix a random number of ``noise_type`` clips into ``audio`` at a random SNR.

        Args:
            noise_type: key into ``self.noise_list`` / ``self.additive_noise_config``.
            audio: 1D numpy array with the clean waveform.

        Returns:
            The noisy waveform (same shape as ``audio``).
        """
        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)

        noise_list = random.sample(
            self.noise_list[noise_type],
            random.randint(
                self.additive_noise_config[noise_type]["min_num_noises"],
                self.additive_noise_config[noise_type]["max_num_noises"],
            ),
        )

        audio_len = audio.shape[0]
        noises_wav = None
        for noise in noise_list:
            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]

            # skip noise clips shorter than the target audio
            if noiseaudio.shape[0] < audio_len:
                continue

            # BUGFIX: the upper bound of the SNR range previously read
            # "max_num_noises" instead of "max_snr_in_db".
            noise_snr = random.uniform(
                self.additive_noise_config[noise_type]["min_snr_in_db"],
                self.additive_noise_config[noise_type]["max_snr_in_db"],
            )
            # scale the noise so that the clean/noise power ratio equals noise_snr (dB)
            noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio

            if noises_wav is None:
                noises_wav = noise_wav
            else:
                noises_wav += noise_wav

        # if all possible files is less than audio, choose other files
        # NOTE(review): this recurses until a long-enough clip is sampled; it
        # would loop forever if *every* clip of this type were too short.
        if noises_wav is None:
            return self.additive_noise(noise_type, audio)

        return audio + noises_wav

    def reverberate(self, audio):
        """Convolve ``audio`` with a random, energy-normalized room impulse response."""
        audio_len = audio.shape[0]

        rir_file = random.choice(self.rir_files)
        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
        rir = rir / np.sqrt(np.sum(rir**2))
        # truncate the convolution back to the original length
        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]

    def apply_one(self, audio):
        """Apply one randomly chosen augmentation (additive noise or RIR) to ``audio``."""
        noise_type = random.choice(self.global_noise_list)
        if noise_type == "RIR_AUG":
            return self.reverberate(audio)

        return self.additive_noise(noise_type, audio)
|
||||
|
||||
def setup_encoder_model(config: "Coqpit"):
    """Instantiate a speaker encoder model from a training config.

    Args:
        config: Coqpit config with a ``model_params`` dict (must contain
            ``"model_name"``) and an ``audio`` section.

    Returns:
        The instantiated ``LSTMSpeakerEncoder`` or ``ResNetSpeakerEncoder``.

    Raises:
        ValueError: if ``model_name`` is not one of "lstm" or "resnet"
            (previously this fell through to an ``UnboundLocalError``).
    """
    model_name = config.model_params["model_name"].lower()
    if model_name == "lstm":
        model = LSTMSpeakerEncoder(
            config.model_params["input_dim"],
            config.model_params["proj_dim"],
            config.model_params["lstm_dim"],
            config.model_params["num_lstm_layers"],
            use_torch_spec=config.model_params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    elif model_name == "resnet":
        model = ResNetSpeakerEncoder(
            input_dim=config.model_params["input_dim"],
            proj_dim=config.model_params["proj_dim"],
            log_input=config.model_params.get("log_input", False),
            use_torch_spec=config.model_params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    else:
        raise ValueError(f" [!] Unknown speaker encoder model_name: {model_name}")
    return model
||||
@@ -0,0 +1,219 @@
|
||||
# coding=utf-8
|
||||
# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
# Only support eager mode and TF>=2.0.0
|
||||
# pylint: disable=no-member, invalid-name, relative-beyond-top-level
|
||||
# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
|
||||
""" voxceleb 1 & 2 """
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
import pandas
|
||||
import soundfile as sf
|
||||
from absl import logging
|
||||
|
||||
# Download URLs for each VoxCeleb subset. Multi-part subsets list one URL per
# archive chunk ("...partaa", "...partab", ...); the chunks are concatenated
# into a single zip before extraction. Single-archive subsets list one ".zip".
SUBSETS = {
    "vox1_dev_wav": [
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
    ],
    "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
    "vox2_dev_aac": [
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
    ],
    "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
}

# Expected md5 of the (possibly reassembled) zip archive of each subset,
# checked after download in download_and_extract().
MD5SUM = {
    "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
    "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
    "vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
    "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312",
}

# VoxCeleb credentials; filled in from the command line in the __main__ block
# and passed to wget during download.
USER = {"user": "", "password": ""}

# Global speaker-name -> integer-id mapping, shared across subsets so that the
# same speaker keeps the same id in every generated csv.
speaker_id_dict = {}
|
||||
|
||||
def download_and_extract(directory, subset, urls):
    """Download and extract the given split of dataset.

    Args:
        directory: the directory where to put the downloaded data.
        subset: subset name of the corpus (a key of ``MD5SUM``).
        urls: the list of urls to download the data file.

    Raises:
        ValueError: if the md5sum of the downloaded archive does not match.
    """
    os.makedirs(directory, exist_ok=True)

    for url in urls:
        zip_filepath = os.path.join(directory, url.split("/")[-1])
        if os.path.exists(zip_filepath):
            continue
        logging.info("Downloading %s to %s" % (url, zip_filepath))
        # Pass the command as an argument list (shell=False) so the
        # user-supplied credentials cannot be interpreted by the shell.
        subprocess.call(
            ["wget", url, "--user", USER["user"], "--password", USER["password"], "-O", zip_filepath]
        )

        statinfo = os.stat(zip_filepath)
        logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))

    # concatenate all parts into zip files (multi-part subsets are shipped as
    # "<name>_partaa", "<name>_partab", ...)
    if ".zip" not in zip_filepath:
        zip_filepath = "_".join(zip_filepath.split("_")[:-1])
        subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True)
        zip_filepath += ".zip"
    # BUGFIX: str.strip(".zip") strips *characters* from both ends, not the
    # suffix, and can mangle names; slice the ".zip" suffix off instead.
    extract_path = zip_filepath[: -len(".zip")]

    # check zip file md5sum before extracting
    with open(zip_filepath, "rb") as f_zip:
        md5 = hashlib.md5(f_zip.read()).hexdigest()
    if md5 != MD5SUM[subset]:
        raise ValueError("md5sum of %s mismatch" % zip_filepath)

    with zipfile.ZipFile(zip_filepath, "r") as zfile:
        zfile.extractall(directory)
        extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename)
    # rename the top-level extracted folder to the subset name
    subprocess.call(["mv", extract_path_ori, extract_path])
|
||||
|
||||
def exec_cmd(cmd):
    """Run a command in a subprocess shell.

    Args:
        cmd: command line to be executed.
    Return:
        int, the return code (-999 when the command could not be started).
    """
    try:
        return_code = subprocess.call(cmd, shell=True)
    except OSError as err:
        logging.info(f"Execution failed: {err}")
        return -999
    if return_code < 0:
        logging.info(f"Child was terminated by signal {return_code}")
    return return_code
|
||||
|
||||
def decode_aac_with_ffmpeg(aac_file, wav_file):
    """Decode a given AAC file into WAV using ffmpeg.

    Args:
        aac_file: file path to input AAC file.
        wav_file: file path to output WAV file.
    Return:
        bool, True if success.
    """
    import shlex  # local import: only needed here, keeps module imports unchanged

    # Quote the paths so that filenames with spaces or shell metacharacters
    # do not break (or get interpreted by) the shell command.
    cmd = f"ffmpeg -i {shlex.quote(aac_file)} {shlex.quote(wav_file)}"
    logging.info(f"Decoding aac file using command line: {cmd}")
    ret = exec_cmd(cmd)
    if ret != 0:
        logging.error(f"Failed to decode aac file with retcode {ret}")
        logging.error("Please check your ffmpeg installation.")
        return False
    return True
|
||||
|
||||
def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
    """Optionally convert AAC to WAV and make speaker labels.

    Walks the subset directory, converts every ``.m4a`` file to WAV with
    ffmpeg (skipping already-converted files), assigns integer speaker ids
    via the module-level ``speaker_id_dict``, and writes one tab-separated
    csv row per audio file.

    Args:
        input_dir: the directory which holds the input dataset.
        subset: the name of the specified subset. e.g. vox1_dev_wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv

    Raises:
        RuntimeError: if an AAC file cannot be decoded by ffmpeg.
    """

    logging.info("Preprocessing audio and label for subset %s" % subset)
    source_dir = os.path.join(input_dir, subset)

    files = []
    # Convert all AAC file into WAV format. At the same time, generate the csv
    for root, _, filenames in os.walk(source_dir):
        for filename in filenames:
            name, ext = os.path.splitext(filename)
            if ext.lower() == ".wav":
                # skip doubled extensions like "xxx.m4a.wav": those are the
                # outputs of the AAC conversion below and are picked up via
                # the ".m4a" branch instead
                _, ext2 = os.path.splitext(name)
                if ext2:
                    continue
                wav_file = os.path.join(root, filename)
            elif ext.lower() == ".m4a":
                # Convert AAC to WAV.
                aac_file = os.path.join(root, filename)
                wav_file = aac_file + ".wav"
                if not os.path.exists(wav_file):
                    if not decode_aac_with_ffmpeg(aac_file, wav_file):
                        raise RuntimeError("Audio decoding failed.")
            else:
                continue
            # speaker id directory is the grandparent of the wav file
            # (presumably the "<speaker>/<video>/<utterance>" VoxCeleb layout
            # -- verify against the extracted tree)
            speaker_name = root.split(os.path.sep)[-2]
            if speaker_name not in speaker_id_dict:
                num = len(speaker_id_dict)
                speaker_id_dict[speaker_name] = num
            # wav_filesize = os.path.getsize(wav_file)
            # NOTE(review): despite the csv column name "wav_length_ms", this
            # stores the raw frame count returned by sf.read, not milliseconds.
            wav_length = len(sf.read(wav_file)[0])
            files.append((os.path.abspath(wav_file), wav_length, speaker_id_dict[speaker_name], speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
|
||||
|
||||
def processor(directory, subset, force_process):
    """Download and process one VoxCeleb subset; return the path of its csv.

    If the csv already exists and ``force_process`` is falsy, the cached csv
    path is returned without re-downloading anything.
    """
    if subset not in SUBSETS:
        raise ValueError(subset, "is not in voxceleb")

    subset_csv = os.path.join(directory, subset + ".csv")
    if os.path.exists(subset_csv) and not force_process:
        return subset_csv

    logging.info("Downloading and process the voxceleb in %s", directory)
    logging.info("Preparing subset %s", subset)
    download_and_extract(directory, subset, SUBSETS[subset])
    convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
    logging.info("Finished downloading and processing")
    return subset_csv
|
||||
|
||||
if __name__ == "__main__":
    logging.set_verbosity(logging.INFO)
    # Expect exactly three arguments: save directory plus the VoxCeleb
    # credentials used by wget during download.
    if len(sys.argv) != 4:
        print("Usage: python prepare_data.py save_directory user password")
        sys.exit()

    DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
    # Download and preprocess every subset sequentially; one csv per subset is
    # written into DIR.
    for SUBSET in SUBSETS:
        processor(DIR, SUBSET, False)
||||
@@ -0,0 +1,99 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from coqpit import Coqpit
|
||||
from trainer import TrainerArgs, get_last_checkpoint
|
||||
from trainer.io import copy_model_files
|
||||
from trainer.logging import logger_factory
|
||||
from trainer.logging.console_logger import ConsoleLogger
|
||||
|
||||
from TTS.config import load_config, register_config
|
||||
from TTS.tts.utils.text.characters import parse_symbols
|
||||
from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
|
||||
|
||||
|
||||
@dataclass
class TrainArgs(TrainerArgs):
    """Trainer arguments extended with the path of the model config file."""

    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
|
||||
|
||||
def getarguments():
    """Return the CLI argument parser built from ``TrainArgs``.

    Alias of ``init_arguments`` kept for backward compatibility; the two were
    previously verbatim duplicates.
    """
    return init_arguments()
|
||||
|
||||
def process_args(args, config=None):
    """Process parsed comand line arguments and initialize the config if not provided.

    Args:
        args (argparse.Namespace or dict like): Parsed input arguments. May
            also be a ``(namespace, overrides)`` tuple as returned by
            ``parse_known_args``.
        config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.

    Returns:
        c (TTS.utils.io.AttrDict): Config paramaters.
        out_path (str): Path to save models and logging.
        audio_path (str): Path to save generated test audios.
        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
            logging to the console.

        dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging

    TODO:
        - Interactive config definition.
    """
    # unpack (namespace, unparsed-overrides) tuples from parse_known_args
    # NOTE(review): coqpit_overrides is only bound here; if args is not a
    # tuple, the override paths below would raise NameError -- confirm callers
    # always pass the tuple form.
    if isinstance(args, tuple):
        args, coqpit_overrides = args
    if args.continue_path:
        # continue a previous training from its output folder
        experiment_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model
    # init config if not already defined
    if config is None:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
        else:
            # init from console args
            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

            config_base = BaseTrainingConfig()
            config_base.parse_known_args(coqpit_overrides)
            config = register_config(config_base.model)()
    # override values from command-line args
    config.parse_known_args(coqpit_overrides, relaxed_parser=True)
    experiment_path = args.continue_path
    if not experiment_path:
        experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
    audio_path = os.path.join(experiment_path, "test_audios")
    config.output_log_path = experiment_path
    # setup rank 0 process in distributed training
    dashboard_logger = None
    if args.rank == 0:
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        # if model characters are not set in the config file
        # save the default set to the config file for future
        # compatibility.
        if config.has("characters") and config.characters is None:
            used_characters = parse_symbols()
            new_fields["characters"] = used_characters
        copy_model_files(config, experiment_path, new_fields)
        dashboard_logger = logger_factory(config, experiment_path)
    c_logger = ConsoleLogger()
    return config, experiment_path, audio_path, c_logger, dashboard_logger
|
||||
|
||||
def init_arguments():
    """Build and return the CLI argument parser derived from ``TrainArgs``."""
    return TrainArgs().init_argparse(arg_prefix="")
||||
|
||||
def init_training(config: Coqpit = None):
    """Initialization of a training run.

    Parses known CLI arguments, resolves the config and output paths, and
    returns everything a training script needs to start.
    """
    parsed = init_arguments().parse_known_args()
    config, out_path, audio_path, c_logger, dashboard_logger = process_args(parsed, config)
    # parsed is a (namespace, unparsed) tuple; callers only need the namespace
    return parsed[0], config, out_path, audio_path, c_logger, dashboard_logger
||||
@@ -0,0 +1,50 @@
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import umap
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
|
||||
# Fixed palette of 13 distinct RGB colors, normalized to [0, 1], used to color
# the per-class points in the UMAP scatter plot (plot_embeddings caps the
# number of plotted classes at 10, so the palette always suffices).
colormap = (
    np.array(
        [
            [76, 255, 0],
            [0, 127, 70],
            [255, 0, 0],
            [255, 217, 38],
            [0, 135, 255],
            [165, 0, 165],
            [255, 167, 255],
            [0, 255, 255],
            [255, 96, 38],
            [142, 76, 0],
            [33, 0, 127],
            [0, 0, 0],
            [183, 183, 183],
        ],
        dtype=float,
    )
    / 255
)
||||
|
||||
|
||||
def plot_embeddings(embeddings, num_classes_in_batch):
    """Project embeddings to 2D with UMAP and scatter-plot them per class.

    Saves the figure to "umap" in the working directory and returns the
    matplotlib figure.
    """
    num_utter_per_class = embeddings.shape[0] // num_classes_in_batch

    # keep only the first 10 classes so the fixed colormap suffices
    if num_classes_in_batch > 10:
        num_classes_in_batch = 10
        embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]

    projection = umap.UMAP().fit_transform(embeddings)
    labels = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
    point_colors = [colormap[label] for label in labels]

    fig, axis = plt.subplots(figsize=(16, 10))
    axis.scatter(projection[:, 0], projection[:, 1], c=point_colors)
    plt.gca().set_aspect("equal", "datalim")
    plt.title("UMAP projection")
    plt.tight_layout()
    plt.savefig("umap")
    return fig
||||
Reference in New Issue
Block a user