Add files via upload

This commit is contained in:
Sam Khoze
2024-06-18 19:43:44 -07:00
committed by GitHub
parent 7d608044ef
commit 69cd493d03
97 changed files with 5916 additions and 0 deletions
10 binary files not shown.
@@ -0,0 +1 @@
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s 
u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
@@ -0,0 +1,79 @@
import bisect

import numpy as np
import torch


def _pad_data(x, length):
    _pad = 0
    assert x.ndim == 1
    return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)


def prepare_data(inputs):
    max_len = max((len(x) for x in inputs))
    return np.stack([_pad_data(x, max_len) for x in inputs])


def _pad_tensor(x, length):
    _pad = 0.0
    assert x.ndim == 2
    x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode="constant", constant_values=_pad)
    return x


def prepare_tensor(inputs, out_steps):
    max_len = max((x.shape[1] for x in inputs))
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_tensor(x, pad_len) for x in inputs])


def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
    """Pad stop target array.

    Args:
        x (np.ndarray): Stop target array.
        length (int): Length after padding.
        pad_val (int, optional): Padding value. Defaults to 1.

    Returns:
        np.ndarray: Padded stop target array.
    """
    assert x.ndim == 1
    return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)


def prepare_stop_target(inputs, out_steps):
    """Pad row vectors with 1."""
    max_len = max((x.shape[0] for x in inputs))
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_stop_target(x, pad_len) for x in inputs])


def pad_per_step(inputs, pad_len):
    return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)


def get_length_balancer_weights(items: list, num_buckets=10):
    # get all durations
    audio_lengths = np.array([item["audio_length"] for item in items])
    # create the $num_buckets bucket classes based on the dataset max and min length
    max_length = int(max(audio_lengths))
    min_length = int(min(audio_lengths))
    step = int((max_length - min_length) / num_buckets) + 1
    buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
    # add each sample to its respective length bucket
    buckets_names = np.array(
        [buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
    )
    # count and compute the bucket weight for each sample
    unique_buckets_names = np.unique(buckets_names).tolist()
    bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
    bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
    weight_bucket = 1.0 / bucket_count
    dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
    # normalize
    dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
    return torch.from_numpy(dataset_samples_weight).float()
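A minimal usage sketch for the padding and length-balancing helpers defined above (the synthetic inputs are assumptions for illustration, and the functions are assumed importable from this module):

import numpy as np
from torch.utils.data import WeightedRandomSampler

# variable-length id sequences -> padded to the batch maximum
print(prepare_data([np.array([1, 2, 3]), np.array([4, 5])]).shape)  # (2, 3)

# [C, T] features -> padded so T becomes a multiple of `out_steps`
specs = [np.random.rand(80, 101), np.random.rand(80, 50)]
print(prepare_tensor(specs, out_steps=2).shape)  # (2, 80, 102)

# per-sample weights that even out audio-length buckets for a sampler
items = [{"audio_length": float(l)} for l in np.random.uniform(1.0, 10.0, 32)]
weights = get_length_balancer_weights(items, num_buckets=5)
sampler = WeightedRandomSampler(weights, num_samples=len(weights))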
@@ -0,0 +1,48 @@
import torch


def rehash_fairseq_vits_checkpoint(checkpoint_file):
    chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
    new_chk = {}
    for k, v in chk.items():
        if "enc_p." in k:
            new_chk[k.replace("enc_p.", "text_encoder.")] = v
        elif "dec." in k:
            new_chk[k.replace("dec.", "waveform_decoder.")] = v
        elif "enc_q." in k:
            new_chk[k.replace("enc_q.", "posterior_encoder.")] = v
        elif "flow.flows.2." in k:
            new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v
        elif "flow.flows.4." in k:
            new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v
        elif "flow.flows.6." in k:
            new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v
        elif "dp.flows.0.m" in k:
            new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v
        elif "dp.flows.0.logs" in k:
            new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v
        elif "dp.flows.1" in k:
            new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v
        elif "dp.flows.3" in k:
            new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v
        elif "dp.flows.5" in k:
            new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v
        elif "dp.flows.7" in k:
            new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v
        elif "dp.post_flows.0.m" in k:
            new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v
        elif "dp.post_flows.0.logs" in k:
            new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v
        elif "dp.post_flows.1" in k:
            new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v
        elif "dp.post_flows.3" in k:
            new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v
        elif "dp.post_flows.5" in k:
            new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v
        elif "dp.post_flows.7" in k:
            new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v
        elif "dp." in k:
            new_chk[k.replace("dp.", "duration_predictor.")] = v
        else:
            new_chk[k] = v
    return new_chk
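A hedged usage sketch for the converter above; the checkpoint file names are placeholders, and how the remapped state dict is loaded into a model afterwards depends on the VITS setup:

import torch

# Remap the fairseq VITS key names to the layout expected here, then save the
# result so it can later be loaded with `model.load_state_dict(new_state_dict)`.
new_state_dict = rehash_fairseq_vits_checkpoint("fairseq_vits_generator.pth")
torch.save({"model": new_state_dict}, "converted_vits_checkpoint.pth")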
@@ -0,0 +1,258 @@
import numpy as np
import torch
from scipy.stats import betabinom
from torch.nn import functional as F
try:
from TTS.tts.utils.monotonic_align.core import maximum_path_c
CYTHON = True
except ModuleNotFoundError:
CYTHON = False
class StandardScaler:
"""StandardScaler for mean-scale normalization with the given mean and scale values."""
def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None:
self.mean_ = mean
self.scale_ = scale
def set_stats(self, mean, scale):
self.mean_ = mean
self.scale_ = scale
def reset_stats(self):
delattr(self, "mean_")
delattr(self, "scale_")
def transform(self, X):
X = np.asarray(X)
X -= self.mean_
X /= self.scale_
return X
def inverse_transform(self, X):
X = np.asarray(X)
X *= self.scale_
X += self.mean_
return X
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
"""Create a sequence mask for filtering padding in a sequence tensor.
Args:
sequence_length (torch.tensor): Sequence lengths.
max_len (int, Optional): Maximum sequence length. Defaults to None.
Shapes:
- mask: :math:`[B, T_max]`
"""
if max_len is None:
max_len = sequence_length.max()
seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
# B x T_max
return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_short=False):
"""Segment each sample in a batch based on the provided segment indices
Args:
x (torch.tensor): Input tensor.
segment_indices (torch.tensor): Segment indices.
segment_size (int): Expected output segment size.
pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size.
"""
# pad the input tensor if it is shorter than the segment size
if pad_short and x.shape[-1] < segment_size:
x = torch.nn.functional.pad(x, (0, segment_size - x.size(2)))
segments = torch.zeros_like(x[:, :, :segment_size])
for i in range(x.size(0)):
index_start = segment_indices[i]
index_end = index_start + segment_size
x_i = x[i]
if pad_short and index_end >= x.size(2):
# pad the sample if it is shorter than the segment size
x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2)))
segments[i] = x_i[:, index_start:index_end]
return segments
def rand_segments(
x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False
):
"""Create random segments based on the input lengths.
Args:
x (torch.tensor): Input tensor.
x_lengths (torch.tensor): Input lengths.
segment_size (int): Expected output segment size.
let_short_samples (bool): Allow shorter samples than the segment size.
pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size.
Shapes:
- x: :math:`[B, C, T]`
- x_lengths: :math:`[B]`
"""
_x_lenghts = x_lengths.clone()
B, _, T = x.size()
if pad_short:
if T < segment_size:
x = torch.nn.functional.pad(x, (0, segment_size - T))
T = segment_size
if _x_lenghts is None:
_x_lenghts = T
len_diff = _x_lenghts - segment_size
if let_short_samples:
_x_lenghts[len_diff < 0] = segment_size
len_diff = _x_lenghts - segment_size
else:
assert all(
len_diff > 0
), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}"
segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long()
ret = segment(x, segment_indices, segment_size, pad_short=pad_short)
return ret, segment_indices
def average_over_durations(values, durs):
"""Average values over durations.
Shapes:
- values: :math:`[B, 1, T_de]`
- durs: :math:`[B, T_en]`
- avg: :math:`[B, 1, T_en]`
"""
durs_cums_ends = torch.cumsum(durs, dim=1).long()
durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
values_nonzero_cums = torch.nn.functional.pad(torch.cumsum(values != 0.0, dim=2), (1, 0))
values_cums = torch.nn.functional.pad(torch.cumsum(values, dim=2), (1, 0))
bs, l = durs_cums_ends.size()
n_formants = values.size(1)
dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)
values_sums = (torch.gather(values_cums, 2, dce) - torch.gather(values_cums, 2, dcs)).float()
values_nelems = (torch.gather(values_nonzero_cums, 2, dce) - torch.gather(values_nonzero_cums, 2, dcs)).float()
avg = torch.where(values_nelems == 0.0, values_nelems, values_sums / values_nelems)
return avg
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def generate_path(duration, mask):
"""
Shapes:
- duration: :math:`[B, T_en]`
- mask: :math:`[B, T_en, T_de]`
- path: :math:`[B, T_en, T_de]`
"""
b, t_x, t_y = mask.shape
cum_duration = torch.cumsum(duration, 1)
cum_duration_flat = cum_duration.view(b * t_x)
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
path = path.view(b, t_x, t_y)
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
path = path * mask
return path
def maximum_path(value, mask):
if CYTHON:
return maximum_path_cython(value, mask)
return maximum_path_numpy(value, mask)
def maximum_path_cython(value, mask):
"""Cython optimised version.
Shapes:
- value: :math:`[B, T_en, T_de]`
- mask: :math:`[B, T_en, T_de]`
"""
value = value * mask
device = value.device
dtype = value.dtype
value = value.data.cpu().numpy().astype(np.float32)
path = np.zeros_like(value).astype(np.int32)
mask = mask.data.cpu().numpy()
t_x_max = mask.sum(1)[:, 0].astype(np.int32)
t_y_max = mask.sum(2)[:, 0].astype(np.int32)
maximum_path_c(path, value, t_x_max, t_y_max)
return torch.from_numpy(path).to(device=device, dtype=dtype)
def maximum_path_numpy(value, mask, max_neg_val=None):
"""
Monotonic alignment search algorithm
Numpy-friendly version. It is about 4 times faster than the torch version.
value: [b, t_x, t_y]
mask: [b, t_x, t_y]
"""
if max_neg_val is None:
max_neg_val = -np.inf # Patch for Sphinx complaint
value = value * mask
device = value.device
dtype = value.dtype
value = value.cpu().detach().numpy()
mask = mask.cpu().detach().numpy().astype(bool)
b, t_x, t_y = value.shape
direction = np.zeros(value.shape, dtype=np.int64)
v = np.zeros((b, t_x), dtype=np.float32)
x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
for j in range(t_y):
v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
v1 = v
max_mask = v1 >= v0
v_max = np.where(max_mask, v1, v0)
direction[:, :, j] = max_mask
index_mask = x_range <= j
v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
direction = np.where(mask, direction, 1)
path = np.zeros(value.shape, dtype=np.float32)
index = mask[:, :, 0].sum(1).astype(np.int64) - 1
index_range = np.arange(b)
for j in reversed(range(t_y)):
path[index_range, index, j] = 1
index = index + direction[index_range, index, j] - 1
path = path * mask.astype(np.float32)
path = torch.from_numpy(path).to(device=device, dtype=dtype)
return path
def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0):
P, M = phoneme_count, mel_count
x = np.arange(0, P)
mel_text_probs = []
for i in range(1, M + 1):
a, b = scaling_factor * i, scaling_factor * (M + 1 - i)
rv = betabinom(P, a, b)
mel_i_prob = rv.pmf(x)
mel_text_probs.append(mel_i_prob)
return np.array(mel_text_probs)
def compute_attn_prior(x_len, y_len, scaling_factor=1.0):
"""Compute attention priors for the alignment network."""
attn_prior = beta_binomial_prior_distribution(
x_len,
y_len,
scaling_factor,
)
return attn_prior # [y_len, x_len]
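A small self-contained sketch (random tensors only, not part of the original file) exercising `sequence_mask` and `rand_segments` from the helpers above:

import torch

lengths = torch.tensor([4, 2, 6])
mask = sequence_mask(lengths)        # [3, 6] boolean mask, True where t < length
print(mask.int())

x = torch.randn(3, 80, 6)            # a [B, C, T] batch of features
segments, start_idxs = rand_segments(x, lengths, segment_size=2, let_short_samples=True, pad_short=True)
print(segments.shape, start_idxs)    # [3, 80, 2] plus the random start indices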
@@ -0,0 +1,125 @@
import os
from typing import Any, Dict, List
import fsspec
import numpy as np
import torch
from coqpit import Coqpit
from TTS.config import check_config_and_model_args
from TTS.tts.utils.managers import BaseIDManager
class LanguageManager(BaseIDManager):
"""Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by language.
Args:
language_ids_file_path (str, optional): Path to the metafile that maps language names to ids used by
TTS models. Defaults to "".
config (Coqpit, optional): Coqpit config that contains the language information in the datasets field.
Defaults to None.
Examples:
>>> manager = LanguageManager(language_ids_file_path=language_ids_file_path)
>>> language_id_mapper = manager.language_ids
"""
def __init__(
self,
language_ids_file_path: str = "",
config: Coqpit = None,
):
super().__init__(id_file_path=language_ids_file_path)
if config:
self.set_language_ids_from_config(config)
@property
def num_languages(self) -> int:
return len(list(self.name_to_id.keys()))
@property
def language_names(self) -> List:
return list(self.name_to_id.keys())
@staticmethod
def parse_language_ids_from_config(c: Coqpit) -> Dict:
"""Set language id from config.
Args:
c (Coqpit): Config
Returns:
Tuple[Dict, int]: Language ID mapping and the number of languages.
"""
languages = set({})
for dataset in c.datasets:
if "language" in dataset:
languages.add(dataset["language"])
else:
raise ValueError(f"Dataset {dataset['name']} has no language specified.")
return {name: i for i, name in enumerate(sorted(list(languages)))}
def set_language_ids_from_config(self, c: Coqpit) -> None:
"""Set language IDs from config samples.
Args:
c (Coqpit): Config.
"""
self.name_to_id = self.parse_language_ids_from_config(c)
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Any:
raise NotImplementedError
def set_ids_from_data(self, items: List, parse_key: str) -> Any:
raise NotImplementedError
def save_ids_to_file(self, file_path: str) -> None:
"""Save language IDs to a json file.
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.name_to_id)
@staticmethod
def init_from_config(config: Coqpit) -> "LanguageManager":
"""Initialize the language manager from a Coqpit config.
Args:
config (Coqpit): Coqpit config.
"""
language_manager = None
if check_config_and_model_args(config, "use_language_embedding", True):
if config.get("language_ids_file", None):
language_manager = LanguageManager(language_ids_file_path=config.language_ids_file)
language_manager = LanguageManager(config=config)
return language_manager
def _set_file_path(path):
"""Find the language_ids.json under the given path or the above it.
Intended to band aid the different paths returned in restored and continued training."""
path_restore = os.path.join(os.path.dirname(path), "language_ids.json")
path_continue = os.path.join(path, "language_ids.json")
fs = fsspec.get_mapper(path).fs
if fs.exists(path_restore):
return path_restore
if fs.exists(path_continue):
return path_continue
return None
def get_language_balancer_weights(items: list):
language_names = np.array([item["language"] for item in items])
unique_language_names = np.unique(language_names).tolist()
language_ids = [unique_language_names.index(l) for l in language_names]
language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names])
weight_language = 1.0 / language_count
# get weight for each sample
dataset_samples_weight = np.array([weight_language[l] for l in language_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()
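A hedged example of feeding `get_language_balancer_weights` into a sampler so rare languages are drawn as often as common ones (the items list is synthetic):

from torch.utils.data import WeightedRandomSampler

items = [{"language": "en"}] * 30 + [{"language": "pt-br"}] * 10
weights = get_language_balancer_weights(items)
sampler = WeightedRandomSampler(weights, num_samples=len(items))
# e.g. DataLoader(train_dataset, sampler=sampler, batch_size=8)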
@@ -0,0 +1,383 @@
import json
import random
from typing import Any, Dict, List, Tuple, Union
import fsspec
import numpy as np
import torch
from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.utils.audio import AudioProcessor
def load_file(path: str):
if path.endswith(".json"):
with fsspec.open(path, "r") as f:
return json.load(f)
elif path.endswith(".pth"):
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location="cpu")
else:
raise ValueError("Unsupported file type")
def save_file(obj: Any, path: str):
if path.endswith(".json"):
with fsspec.open(path, "w") as f:
json.dump(obj, f, indent=4)
elif path.endswith(".pth"):
with fsspec.open(path, "wb") as f:
torch.save(obj, f)
else:
raise ValueError("Unsupported file type")
class BaseIDManager:
"""Base `ID` Manager class. Every new `ID` manager must inherit this.
It defines common `ID` manager specific functions.
"""
def __init__(self, id_file_path: str = ""):
self.name_to_id = {}
if id_file_path:
self.load_ids_from_file(id_file_path)
@staticmethod
def _load_json(json_file_path: str) -> Dict:
with fsspec.open(json_file_path, "r") as f:
return json.load(f)
@staticmethod
def _save_json(json_file_path: str, data: dict) -> None:
with fsspec.open(json_file_path, "w") as f:
json.dump(data, f, indent=4)
def set_ids_from_data(self, items: List, parse_key: str) -> None:
"""Set IDs from data samples.
Args:
items (List): Data samples returned by `load_tts_samples()`.
"""
self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
def load_ids_from_file(self, file_path: str) -> None:
"""Set IDs from a file.
Args:
file_path (str): Path to the file.
"""
self.name_to_id = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file.
Args:
file_path (str): Path to the output file.
"""
save_file(self.name_to_id, file_path)
def get_random_id(self) -> Any:
"""Get a random embedding.
Args:
Returns:
np.ndarray: embedding.
"""
if self.name_to_id:
return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
return None
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]:
"""Parse IDs from data samples retured by `load_tts_samples()`.
Args:
items (list): Data sampled returned by `load_tts_samples()`.
parse_key (str): The key to being used to parse the data.
Returns:
Tuple[Dict]: speaker IDs.
"""
classes = sorted({item[parse_key] for item in items})
ids = {name: i for i, name in enumerate(classes)}
return ids
class EmbeddingManager(BaseIDManager):
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions.
It expects embeddings files in the following format:
::
{
'audio_file_key':{
'name': 'category_name',
'embedding': [<embedding_values>]
},
...
}
`audio_file_key` is a unique key to the audio file in the dataset. It can be the path to the file or any other unique key.
`embedding` is the embedding vector of the audio file.
`name` can be name of the speaker of the audio file.
"""
def __init__(
self,
embedding_file_path: Union[str, List[str]] = "",
id_file_path: str = "",
encoder_model_path: str = "",
encoder_config_path: str = "",
use_cuda: bool = False,
):
super().__init__(id_file_path=id_file_path)
self.embeddings = {}
self.embeddings_by_names = {}
self.clip_ids = []
self.encoder = None
self.encoder_ap = None
self.use_cuda = use_cuda
if embedding_file_path:
if isinstance(embedding_file_path, list):
self.load_embeddings_from_list_of_files(embedding_file_path)
else:
self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property
def num_embeddings(self):
"""Get number of embeddings."""
return len(self.embeddings)
@property
def num_names(self):
"""Get number of embeddings."""
return len(self.embeddings_by_names)
@property
def embedding_dim(self):
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
if self.embeddings:
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
return 0
@property
def embedding_names(self):
"""Get embedding names."""
return list(self.embeddings_by_names.keys())
def save_embeddings_to_file(self, file_path: str) -> None:
"""Save embeddings to a json file.
Args:
file_path (str): Path to the output file.
"""
save_file(self.embeddings, file_path)
@staticmethod
def read_embeddings_from_file(file_path: str):
"""Load embeddings from a json file.
Args:
file_path (str): Path to the file.
"""
embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in embeddings.values()})
name_to_id = {name: i for i, name in enumerate(speakers)}
clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
embeddings_by_names = {}
for x in embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return name_to_id, clip_ids, embeddings, embeddings_by_names
def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file.
Args:
file_path (str): Path to the target json file.
"""
self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
file_path
)
def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
"""Load embeddings from a list of json files and don't allow duplicate keys.
Args:
file_paths (List[str]): List of paths to the target json files.
"""
self.name_to_id = {}
self.clip_ids = []
self.embeddings_by_names = {}
self.embeddings = {}
for file_path in file_paths:
ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
# check colliding keys
duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
if duplicates:
raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
# store values
self.name_to_id.update(ids)
self.clip_ids.extend(clip_ids)
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
# reset name_to_id to get the right speaker ids
self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID.
Args:
clip_idx (str): Target clip ID.
Returns:
List: embedding as a list.
"""
return self.embeddings[clip_idx]["embedding"]
def get_embeddings_by_name(self, idx: str) -> List[List]:
"""Get all embeddings of a speaker.
Args:
idx (str): Target name.
Returns:
List[List]: all the embeddings of the given speaker.
"""
return self.embeddings_by_names[idx]
def get_embeddings_by_names(self) -> Dict:
"""Get all embeddings by names.
Returns:
Dict: all the embeddings of each speaker.
"""
embeddings_by_names = {}
for x in self.embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return embeddings_by_names
def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
"""Get mean embedding of a idx.
Args:
idx (str): Target name.
num_samples (int, optional): Number of samples to be averaged. Defaults to None.
randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False.
Returns:
np.ndarray: Mean embedding.
"""
embeddings = self.get_embeddings_by_name(idx)
if num_samples is None:
embeddings = np.stack(embeddings).mean(0)
else:
assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}"
if randomize:
embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0)
else:
embeddings = np.stack(embeddings[:num_samples]).mean(0)
return embeddings
def get_random_embedding(self) -> Any:
"""Get a random embedding.
Args:
Returns:
np.ndarray: embedding.
"""
if self.embeddings:
return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"]
return None
def get_clips(self) -> List:
return sorted(self.embeddings.keys())
def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
"""Initialize a speaker encoder model.
Args:
model_path (str): Model file path.
config_path (str): Model config file path.
use_cuda (bool, optional): Use CUDA. Defaults to False.
"""
self.use_cuda = use_cuda
self.encoder_config = load_config(config_path)
self.encoder = setup_encoder_model(self.encoder_config)
self.encoder_criterion = self.encoder.load_checkpoint(
self.encoder_config, model_path, eval=True, use_cuda=use_cuda, cache=True
)
self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:
"""Compute a embedding from a given audio file.
Args:
wav_file (Union[str, List[str]]): Target file path.
Returns:
list: Computed embedding.
"""
def _compute(wav_file: str):
waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)
if not self.encoder_config.model_params.get("use_torch_spec", False):
m_input = self.encoder_ap.melspectrogram(waveform)
m_input = torch.from_numpy(m_input)
else:
m_input = torch.from_numpy(waveform)
if self.use_cuda:
m_input = m_input.cuda()
m_input = m_input.unsqueeze(0)
embedding = self.encoder.compute_embedding(m_input)
return embedding
if isinstance(wav_file, list):
# compute the mean embedding
embeddings = None
for wf in wav_file:
embedding = _compute(wf)
if embeddings is None:
embeddings = embedding
else:
embeddings += embedding
return (embeddings / len(wav_file))[0].tolist()
embedding = _compute(wav_file)
return embedding[0].tolist()
def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
"""Compute embedding from features.
Args:
feats (Union[torch.Tensor, np.ndarray]): Input features.
Returns:
List: computed embedding.
"""
if isinstance(feats, np.ndarray):
feats = torch.from_numpy(feats)
if feats.ndim == 2:
feats = feats.unsqueeze(0)
if self.use_cuda:
feats = feats.cuda()
return self.encoder.compute_embedding(feats)
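A hedged sketch of the embeddings-file format expected by `EmbeddingManager` and a couple of its accessors; the file name, speaker names and vectors are made up:

import json

embeddings = {
    "clip_0001.wav": {"name": "speakerA", "embedding": [0.1, 0.2, 0.3]},
    "clip_0002.wav": {"name": "speakerA", "embedding": [0.3, 0.2, 0.1]},
    "clip_0003.wav": {"name": "speakerB", "embedding": [0.9, 0.8, 0.7]},
}
with open("embeddings.json", "w") as f:
    json.dump(embeddings, f)

manager = EmbeddingManager(embedding_file_path="embeddings.json")
print(manager.num_embeddings, manager.embedding_dim)  # 3 3
print(manager.get_mean_embedding("speakerA"))         # mean of the two speakerA vectors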
@@ -0,0 +1,15 @@
def alignment_diagonal_score(alignments, binary=False):
    """Compute how diagonal alignment predictions are. It is useful
    to measure the alignment consistency of a model.

    Args:
        alignments (torch.Tensor): batch of alignments.
        binary (bool): if True, ignore scores and consider attention
            as a binary mask.

    Shape:
        - alignments : :math:`[B, T_de, T_en]`
    """
    maxs = alignments.max(dim=1)[0]
    if binary:
        maxs[maxs > 0] = 1
    return maxs.mean(dim=1).mean(dim=0).item()
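A quick sanity check (synthetic alignments only) for `alignment_diagonal_score`:

import torch

diagonal = torch.eye(6).unsqueeze(0).repeat(2, 1, 1)   # [B, T_de, T_en], perfectly diagonal
print(alignment_diagonal_score(diagonal))              # 1.0
print(alignment_diagonal_score(torch.rand(2, 6, 6)))   # noticeably lower for noisy attention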
@@ -0,0 +1,47 @@
import numpy as np

cimport cython
cimport numpy as np

from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef float tmp
    cdef int index = t_x - 1

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[x, y-1]
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[x-1, y-1]
            value[x, y] = max(v_cur, v_prev) + value[x, y]

    for y in range(t_y - 1, -1, -1):
        path[index, y] = 1
        if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
            index = index - 1


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
    cdef int b = values.shape[0]
    cdef int i
    for i in prange(b, nogil=True):
        maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
@@ -0,0 +1,7 @@
# from distutils.core import setup
# from Cython.Build import cythonize
# import numpy
# setup(name='monotonic_align',
# ext_modules=cythonize("core.pyx"),
# include_dirs=[numpy.get_include()])
@@ -0,0 +1,222 @@
import json
import os
from typing import Any, Dict, List, Union
import fsspec
import numpy as np
import torch
from coqpit import Coqpit
from TTS.config import get_from_config_or_model_args_with_default
from TTS.tts.utils.managers import EmbeddingManager
class SpeakerManager(EmbeddingManager):
"""Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by speaker or clip.
There are 3 different scenarios considered:
1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer.
2. Models using d-vectors. The datafile includes a dictionary in the following format.
::
{
'clip_name.wav':{
'name': 'speakerA',
'embedding': [<d_vector_values>]
},
...
}
3. Computing the d-vectors by the speaker encoder. It loads the speaker encoder model and
computes the d-vectors for a given clip or speaker.
Args:
d_vectors_file_path (str, optional): Path to the metafile including the d-vectors. Defaults to "".
speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by
TTS models. Defaults to "".
encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
Examples:
>>> # load audio processor and speaker encoder
>>> ap = AudioProcessor(**config.audio)
>>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
>>> # load a sample audio and compute embedding
>>> waveform = ap.load_wav(sample_wav_path)
>>> mel = ap.melspectrogram(waveform)
>>> d_vector = manager.compute_embeddings(mel.T)
"""
def __init__(
self,
data_items: List[List[Any]] = None,
d_vectors_file_path: str = "",
speaker_id_file_path: str = "",
encoder_model_path: str = "",
encoder_config_path: str = "",
use_cuda: bool = False,
):
super().__init__(
embedding_file_path=d_vectors_file_path,
id_file_path=speaker_id_file_path,
encoder_model_path=encoder_model_path,
encoder_config_path=encoder_config_path,
use_cuda=use_cuda,
)
if data_items:
self.set_ids_from_data(data_items, parse_key="speaker_name")
@property
def num_speakers(self):
return len(self.name_to_id)
@property
def speaker_names(self):
return list(self.name_to_id.keys())
def get_speakers(self) -> List:
return self.name_to_id
@staticmethod
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
"""Initialize a speaker manager from config
Args:
config (Coqpit): Config object.
samples (Union[List[List], List[Dict]], optional): List of data samples to parse out the speaker names.
Defaults to None.
Returns:
SpeakerManager: Speaker manager object.
"""
speaker_manager = None
if get_from_config_or_model_args_with_default(config, "use_speaker_embedding", False):
if samples:
speaker_manager = SpeakerManager(data_items=samples)
if get_from_config_or_model_args_with_default(config, "speaker_file", None):
speaker_manager = SpeakerManager(
speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
)
if get_from_config_or_model_args_with_default(config, "speakers_file", None):
speaker_manager = SpeakerManager(
speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speakers_file", None)
)
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
speaker_manager = SpeakerManager()
if get_from_config_or_model_args_with_default(config, "d_vector_file", None):
speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None)
)
return speaker_manager
def _set_file_path(path):
"""Find the speakers.json under the given path or the above it.
Intended to band aid the different paths returned in restored and continued training."""
path_restore = os.path.join(os.path.dirname(path), "speakers.json")
path_continue = os.path.join(path, "speakers.json")
fs = fsspec.get_mapper(path).fs
if fs.exists(path_restore):
return path_restore
if fs.exists(path_continue):
return path_continue
raise FileNotFoundError(f" [!] `speakers.json` not found in {path}")
def load_speaker_mapping(out_path):
"""Loads speaker mapping if already present."""
if os.path.splitext(out_path)[1] == ".json":
json_file = out_path
else:
json_file = _set_file_path(out_path)
with fsspec.open(json_file, "r") as f:
return json.load(f)
def save_speaker_mapping(out_path, speaker_mapping):
"""Saves speaker mapping if not yet present."""
if out_path is not None:
speakers_json_path = _set_file_path(out_path)
with fsspec.open(speakers_json_path, "w") as f:
json.dump(speaker_mapping, f, indent=4)
def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager:
"""Initiate a `SpeakerManager` instance by the provided config.
Args:
c (Coqpit): Model configuration.
restore_path (str): Path to a previous training folder.
data (List): Data samples used in training to infer speakers from. It must be provided if speaker embedding
layers is used. Defaults to None.
out_path (str, optional): Save the generated speaker IDs to an output path. Defaults to None.
Returns:
SpeakerManager: initialized and ready to use instance.
"""
speaker_manager = SpeakerManager()
if c.use_speaker_embedding:
if data is not None:
speaker_manager.set_ids_from_data(data, parse_key="speaker_name")
if restore_path:
speakers_file = _set_file_path(restore_path)
# restoring speaker manager from a previous run.
if c.use_d_vector_file:
# restore speaker manager with the embedding file
if not os.path.exists(speakers_file):
print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.d_vector_file")
if not os.path.exists(c.d_vector_file):
raise RuntimeError(
"You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file"
)
speaker_manager.load_embeddings_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(speakers_file)
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
speaker_ids_from_data = speaker_manager.name_to_id
speaker_manager.load_ids_from_file(speakers_file)
assert all(
speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings.
speaker_manager.load_embeddings_from_file(c.d_vector_file)
elif c.use_d_vector_file and not c.d_vector_file:
raise "use_d_vector_file is True, so you need pass a external speaker embedding file."
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file:
# new speaker manager with speaker IDs file.
speaker_manager.load_ids_from_file(c.speakers_file)
if speaker_manager.num_speakers > 0:
print(
" > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
)
)
# save file if path is defined
if out_path:
out_file_path = os.path.join(out_path, "speakers.json")
print(f" > Saving `speakers.json` to {out_file_path}.")
if c.use_d_vector_file and c.d_vector_file:
speaker_manager.save_embeddings_to_file(out_file_path)
else:
speaker_manager.save_ids_to_file(out_file_path)
return speaker_manager
def get_speaker_balancer_weights(items: list):
speaker_names = np.array([item["speaker_name"] for item in items])
unique_speaker_names = np.unique(speaker_names).tolist()
speaker_ids = [unique_speaker_names.index(l) for l in speaker_names]
speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names])
weight_speaker = 1.0 / speaker_count
dataset_samples_weight = np.array([weight_speaker[l] for l in speaker_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()
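A hedged usage sketch for `SpeakerManager` driven by a speaker-ID file only (the file name and contents are illustrative):

import json

with open("speakers.json", "w") as f:
    json.dump({"speakerA": 0, "speakerB": 1}, f)

manager = SpeakerManager(speaker_id_file_path="speakers.json")
print(manager.num_speakers)   # 2
print(manager.speaker_names)  # ['speakerA', 'speakerB']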
@@ -0,0 +1,383 @@
# Adopted from https://github.com/photosynthesis-team/piq
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss
def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor:
r"""Reduce input in batch dimension if needed.
Args:
x: Tensor with shape (N, *).
reduction: Specifies the reduction type:
``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'``
"""
if reduction == "none":
return x
if reduction == "mean":
return x.mean(dim=0)
if reduction == "sum":
return x.sum(dim=0)
raise ValueError("Unknown reduction. Expected one of {'none', 'mean', 'sum'}")
def _validate_input(
tensors: List[torch.Tensor],
dim_range: Tuple[int, int] = (0, -1),
data_range: Tuple[float, float] = (0.0, -1.0),
# size_dim_range: Tuple[float, float] = (0., -1.),
size_range: Optional[Tuple[int, int]] = None,
) -> None:
r"""Check that input(-s) satisfies the requirements
Args:
tensors: Tensors to check
dim_range: Allowed number of dimensions. (min, max)
data_range: Allowed range of values in tensors. (min, max)
size_range: Dimensions to include in size comparison. (start_dim, end_dim + 1)
"""
if not __debug__:
return
x = tensors[0]
for t in tensors:
assert torch.is_tensor(t), f"Expected torch.Tensor, got {type(t)}"
assert t.device == x.device, f"Expected tensors to be on {x.device}, got {t.device}"
if size_range is None:
assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}"
else:
assert (
t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]]
), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}"
if dim_range[0] == dim_range[1]:
assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}"
elif dim_range[0] < dim_range[1]:
assert (
dim_range[0] <= t.dim() <= dim_range[1]
), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}"
if data_range[0] < data_range[1]:
assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}"
assert t.max() <= data_range[1], f"Expected values to be lower or equal to {data_range[1]}, got {t.max()}"
def gaussian_filter(kernel_size: int, sigma: float) -> torch.Tensor:
r"""Returns 2D Gaussian kernel N(0,`sigma`^2)
Args:
kernel_size: Size of the kernel
sigma: Std of the distribution
Returns:
gaussian_kernel: Tensor with shape (1, kernel_size, kernel_size)
"""
coords = torch.arange(kernel_size, dtype=torch.float32)
coords -= (kernel_size - 1) / 2.0
g = coords**2
g = (-(g.unsqueeze(0) + g.unsqueeze(1)) / (2 * sigma**2)).exp()
g /= g.sum()
return g.unsqueeze(0)
def ssim(
x: torch.Tensor,
y: torch.Tensor,
kernel_size: int = 11,
kernel_sigma: float = 1.5,
data_range: Union[int, float] = 1.0,
reduction: str = "mean",
full: bool = False,
downsample: bool = True,
k1: float = 0.01,
k2: float = 0.03,
) -> List[torch.Tensor]:
r"""Interface of Structural Similarity (SSIM) index.
Inputs supposed to be in range ``[0, data_range]``.
To match performance with skimage and tensorflow set ``'downsample' = True``.
Args:
x: An input tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
y: A target tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
kernel_size: The side-length of the sliding window used in comparison. Must be an odd value.
kernel_sigma: Sigma of normal distribution.
data_range: Maximum value range of images (usually 1.0 or 255).
reduction: Specifies the reduction type:
``'none'`` | ``'mean'`` | ``'sum'``. Default:``'mean'``
full: Return cs map or not.
downsample: Perform average pool before SSIM computation. Default: True
k1: Algorithm parameter, K1 (small constant).
k2: Algorithm parameter, K2 (small constant).
Try a larger K2 constant (e.g. 0.4) if you get negative or NaN results.
Returns:
Value of Structural Similarity (SSIM) index. In case of 5D input tensors, complex value is returned
as a tensor of size 2.
References:
Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004).
Image quality assessment: From error visibility to structural similarity.
IEEE Transactions on Image Processing, 13, 600-612.
https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
DOI: `10.1109/TIP.2003.819861`
"""
assert kernel_size % 2 == 1, f"Kernel size must be odd, got [{kernel_size}]"
_validate_input([x, y], dim_range=(4, 5), data_range=(0, data_range))
x = x / float(data_range)
y = y / float(data_range)
# Averagepool image if the size is large enough
f = max(1, round(min(x.size()[-2:]) / 256))
if (f > 1) and downsample:
x = F.avg_pool2d(x, kernel_size=f)
y = F.avg_pool2d(y, kernel_size=f)
kernel = gaussian_filter(kernel_size, kernel_sigma).repeat(x.size(1), 1, 1, 1).to(y)
_compute_ssim_per_channel = _ssim_per_channel_complex if x.dim() == 5 else _ssim_per_channel
ssim_map, cs_map = _compute_ssim_per_channel(x=x, y=y, kernel=kernel, k1=k1, k2=k2)
ssim_val = ssim_map.mean(1)
cs = cs_map.mean(1)
ssim_val = _reduce(ssim_val, reduction)
cs = _reduce(cs, reduction)
if full:
return [ssim_val, cs]
return ssim_val
class SSIMLoss(_Loss):
r"""Creates a criterion that measures the structural similarity index error between
each element in the input :math:`x` and target :math:`y`.
To match performance with skimage and tensorflow set ``'downsample' = True``.
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
.. math::
SSIM = \{ssim_1,\dots,ssim_{N \times C}\}\\
ssim_{l}(x, y) = \frac{(2 \mu_x \mu_y + c_1) (2 \sigma_{xy} + c_2)}
{(\mu_x^2 +\mu_y^2 + c_1)(\sigma_x^2 +\sigma_y^2 + c_2)},
where :math:`N` is the batch size, `C` is the channel size. If :attr:`reduction` is not ``'none'``
(default ``'mean'``), then:
.. math::
SSIMLoss(x, y) =
\begin{cases}
\operatorname{mean}(1 - SSIM), & \text{if reduction} = \text{'mean';}\\
\operatorname{sum}(1 - SSIM), & \text{if reduction} = \text{'sum'.}
\end{cases}
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total
of :math:`n` elements each.
The sum operation still operates over all the elements, and divides by :math:`n`.
The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
In case of 5D input tensors, complex value is returned as a tensor of size 2.
Args:
kernel_size: By default, the mean and covariance of a pixel is obtained
by convolution with given filter_size.
kernel_sigma: Standard deviation for Gaussian kernel.
k1: Coefficient related to c1 in the above equation.
k2: Coefficient related to c2 in the above equation.
downsample: Perform average pool before SSIM computation. Default: True
reduction: Specifies the reduction type:
``'none'`` | ``'mean'`` | ``'sum'``. Default:``'mean'``
data_range: Maximum value range of images (usually 1.0 or 255).
Examples:
>>> loss = SSIMLoss()
>>> x = torch.rand(3, 3, 256, 256, requires_grad=True)
>>> y = torch.rand(3, 3, 256, 256)
>>> output = loss(x, y)
>>> output.backward()
References:
Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004).
Image quality assessment: From error visibility to structural similarity.
IEEE Transactions on Image Processing, 13, 600-612.
https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
DOI:`10.1109/TIP.2003.819861`
"""
__constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"]
def __init__(
self,
kernel_size: int = 11,
kernel_sigma: float = 1.5,
k1: float = 0.01,
k2: float = 0.03,
downsample: bool = True,
reduction: str = "mean",
data_range: Union[int, float] = 1.0,
) -> None:
super().__init__()
# Generic loss parameters.
self.reduction = reduction
# Loss-specific parameters.
self.kernel_size = kernel_size
# This check might look redundant because kernel size is checked within the ssim function anyway.
# However, this check allows us to fail fast when the loss is being initialised, before training has started.
assert kernel_size % 2 == 1, f"Kernel size must be odd, got [{kernel_size}]"
self.kernel_sigma = kernel_sigma
self.k1 = k1
self.k2 = k2
self.downsample = downsample
self.data_range = data_range
def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
r"""Computation of Structural Similarity (SSIM) index as a loss function.
Args:
x: An input tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
y: A target tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
Returns:
Value of SSIM loss to be minimized, i.e ``1 - ssim`` in [0, 1] range. In case of 5D input tensors,
complex value is returned as a tensor of size 2.
"""
score = ssim(
x=x,
y=y,
kernel_size=self.kernel_size,
kernel_sigma=self.kernel_sigma,
downsample=self.downsample,
data_range=self.data_range,
reduction=self.reduction,
full=False,
k1=self.k1,
k2=self.k2,
)
return torch.ones_like(score) - score
def _ssim_per_channel(
x: torch.Tensor,
y: torch.Tensor,
kernel: torch.Tensor,
k1: float = 0.01,
k2: float = 0.03,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
r"""Calculate Structural Similarity (SSIM) index for X and Y per channel.
Args:
x: An input tensor. Shape :math:`(N, C, H, W)`.
y: A target tensor. Shape :math:`(N, C, H, W)`.
kernel: 2D Gaussian kernel.
k1: Algorithm parameter, K1 (small constant, see [1]).
k2: Algorithm parameter, K2 (small constant, see [1]).
Try a larger K2 constant (e.g. 0.4) if you get negative or NaN results.
Returns:
Full Value of Structural Similarity (SSIM) index.
"""
if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2):
raise ValueError(
f"Kernel size can't be greater than actual input size. Input size: {x.size()}. "
f"Kernel size: {kernel.size()}"
)
c1 = k1**2
c2 = k2**2
n_channels = x.size(1)
mu_x = F.conv2d(x, weight=kernel, stride=1, padding=0, groups=n_channels)
mu_y = F.conv2d(y, weight=kernel, stride=1, padding=0, groups=n_channels)
mu_xx = mu_x**2
mu_yy = mu_y**2
mu_xy = mu_x * mu_y
sigma_xx = F.conv2d(x**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xx
sigma_yy = F.conv2d(y**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_yy
sigma_xy = F.conv2d(x * y, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xy
# Contrast sensitivity (CS) with alpha = beta = gamma = 1.
cs = (2.0 * sigma_xy + c2) / (sigma_xx + sigma_yy + c2)
# Structural similarity (SSIM)
ss = (2.0 * mu_xy + c1) / (mu_xx + mu_yy + c1) * cs
ssim_val = ss.mean(dim=(-1, -2))
cs = cs.mean(dim=(-1, -2))
return ssim_val, cs
def _ssim_per_channel_complex(
x: torch.Tensor,
y: torch.Tensor,
kernel: torch.Tensor,
k1: float = 0.01,
k2: float = 0.03,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel.
Args:
x: An input tensor. Shape :math:`(N, C, H, W, 2)`.
y: A target tensor. Shape :math:`(N, C, H, W, 2)`.
kernel: 2-D gauss kernel.
k1: Algorithm parameter, K1 (small constant, see [1]).
k2: Algorithm parameter, K2 (small constant, see [1]).
Try a larger K2 constant (e.g. 0.4) if you get negative or NaN results.
Returns:
Full Value of Complex Structural Similarity (SSIM) index.
"""
n_channels = x.size(1)
if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2):
raise ValueError(
f"Kernel size can't be greater than actual input size. Input size: {x.size()}. "
f"Kernel size: {kernel.size()}"
)
c1 = k1**2
c2 = k2**2
x_real = x[..., 0]
x_imag = x[..., 1]
y_real = y[..., 0]
y_imag = y[..., 1]
mu1_real = F.conv2d(x_real, weight=kernel, stride=1, padding=0, groups=n_channels)
mu1_imag = F.conv2d(x_imag, weight=kernel, stride=1, padding=0, groups=n_channels)
mu2_real = F.conv2d(y_real, weight=kernel, stride=1, padding=0, groups=n_channels)
mu2_imag = F.conv2d(y_imag, weight=kernel, stride=1, padding=0, groups=n_channels)
mu1_sq = mu1_real.pow(2) + mu1_imag.pow(2)
mu2_sq = mu2_real.pow(2) + mu2_imag.pow(2)
mu1_mu2_real = mu1_real * mu2_real - mu1_imag * mu2_imag
mu1_mu2_imag = mu1_real * mu2_imag + mu1_imag * mu2_real
compensation = 1.0
x_sq = x_real.pow(2) + x_imag.pow(2)
y_sq = y_real.pow(2) + y_imag.pow(2)
x_y_real = x_real * y_real - x_imag * y_imag
x_y_imag = x_real * y_imag + x_imag * y_real
sigma1_sq = F.conv2d(x_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_sq
sigma2_sq = F.conv2d(y_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu2_sq
sigma12_real = F.conv2d(x_y_real, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_real
sigma12_imag = F.conv2d(x_y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_imag
sigma12 = torch.stack((sigma12_imag, sigma12_real), dim=-1)
mu1_mu2 = torch.stack((mu1_mu2_real, mu1_mu2_imag), dim=-1)
# Set alpha = beta = gamma = 1.
cs_map = (sigma12 * 2 + c2 * compensation) / (sigma1_sq.unsqueeze(-1) + sigma2_sq.unsqueeze(-1) + c2 * compensation)
ssim_map = (mu1_mu2 * 2 + c1 * compensation) / (mu1_sq.unsqueeze(-1) + mu2_sq.unsqueeze(-1) + c1 * compensation)
ssim_map = ssim_map * cs_map
ssim_val = ssim_map.mean(dim=(-2, -3))
cs = cs_map.mean(dim=(-2, -3))
return ssim_val, cs
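A minimal sketch of `SSIMLoss` applied to spectrogram-like tensors treated as single-channel images; the shapes and values are synthetic:

import torch

criterion = SSIMLoss(kernel_size=11, data_range=1.0)
pred = torch.rand(4, 1, 80, 120, requires_grad=True)   # [B, C, n_mels, T] scaled to [0, 1]
target = torch.rand(4, 1, 80, 120)
loss = criterion(pred, target)   # scalar `1 - ssim` value to minimize
loss.backward()
print(loss.item())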
@@ -0,0 +1,343 @@
from typing import Dict
import numpy as np
import torch
from torch import nn
def numpy_to_torch(np_array, dtype, cuda=False, device="cpu"):
if cuda:
device = "cuda"
if np_array is None:
return None
tensor = torch.as_tensor(np_array, dtype=dtype, device=device)
return tensor
def compute_style_mel(style_wav, ap, cuda=False, device="cpu"):
if cuda:
device = "cuda"
style_mel = torch.FloatTensor(
ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate)),
device=device,
).unsqueeze(0)
return style_mel
def run_model_torch(
model: nn.Module,
inputs: torch.Tensor,
speaker_id: int = None,
style_mel: torch.Tensor = None,
style_text: str = None,
d_vector: torch.Tensor = None,
language_id: torch.Tensor = None,
) -> Dict:
"""Run a torch model for inference. It does not support batch inference.
Args:
model (nn.Module): The model to run inference.
inputs (torch.Tensor): Input tensor with character ids.
speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None.
style_mel (torch.Tensor, optional): Spectrograms used for voice styling. Defaults to None.
d_vector (torch.Tensor, optional): d-vector for multi-speaker models. Defaults to None.
Returns:
Dict: model outputs.
"""
input_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device)
if hasattr(model, "module"):
_func = model.module.inference
else:
_func = model.inference
outputs = _func(
inputs,
aux_input={
"x_lengths": input_lengths,
"speaker_ids": speaker_id,
"d_vectors": d_vector,
"style_mel": style_mel,
"style_text": style_text,
"language_ids": language_id,
},
)
return outputs
def trim_silence(wav, ap):
return wav[: ap.find_endpoint(wav)]
def inv_spectrogram(postnet_output, ap, CONFIG):
if CONFIG.model.lower() in ["tacotron"]:
wav = ap.inv_spectrogram(postnet_output.T)
else:
wav = ap.inv_melspectrogram(postnet_output.T)
return wav
def id_to_torch(aux_id, cuda=False, device="cpu"):
if cuda:
device = "cuda"
if aux_id is not None:
aux_id = np.asarray(aux_id)
aux_id = torch.from_numpy(aux_id).to(device)
return aux_id
def embedding_to_torch(d_vector, cuda=False, device="cpu"):
if cuda:
device = "cuda"
if d_vector is not None:
d_vector = np.asarray(d_vector)
d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor)
d_vector = d_vector.squeeze().unsqueeze(0).to(device)
return d_vector
# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
"""Apply griffin-lim to each sample iterating throught the first dimension.
Args:
inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
input_lens (Tensor or np.Array): 1D array of sample lengths.
CONFIG (Dict): TTS config.
ap (AudioProcessor): TTS audio processor.
"""
wavs = []
for idx, spec in enumerate(inputs):
wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding
wav = inv_spectrogram(spec, ap, CONFIG)
# assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}"
wavs.append(wav[:wav_len])
return wavs
def synthesis(
model,
text,
CONFIG,
use_cuda,
speaker_id=None,
style_wav=None,
style_text=None,
use_griffin_lim=False,
do_trim_silence=False,
d_vector=None,
language_id=None,
):
"""Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
the vocoder model.
Args:
model (TTS.tts.models):
The TTS model to synthesize audio with.
text (str):
The input text to convert to speech.
CONFIG (Coqpit):
Model configuration.
use_cuda (bool):
Enable/disable CUDA.
speaker_id (int):
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
style_wav (str | Dict[str, float]):
Path or tensor to/of a waveform used for computing the style embedding based on GST or Capacitron.
Defaults to None, meaning that Capacitron models will sample from the prior distribution to
generate random but realistic prosody.
style_text (str):
Transcription of style_wav for Capacitron models. Defaults to None.
enable_eos_bos_chars (bool):
Enable special characters for the end and beginning of sentences. Defaults to False.
do_trim_silence (bool):
Trim silence after synthesis. Defaults to False.
d_vector (torch.Tensor):
d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
language_id (int):
Language ID passed to the language embedding layer in multi-lingual models. Defaults to None.
"""
# device
device = next(model.parameters()).device
if use_cuda:
device = "cuda"
# GST or Capacitron processing
# TODO: need to handle the case of setting both gst and capacitron to true somewhere
style_mel = None
if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
if isinstance(style_wav, dict):
style_mel = style_wav
else:
style_mel = compute_style_mel(style_wav, model.ap, device=device)
if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None:
style_mel = compute_style_mel(style_wav, model.ap, device=device)
style_mel = style_mel.transpose(1, 2) # [1, time, depth]
language_name = None
if language_id is not None:
language = [k for k, v in model.language_manager.name_to_id.items() if v == language_id]
assert len(language) == 1, "language_id must be a valid language"
language_name = language[0]
# convert text to sequence of token IDs
text_inputs = np.asarray(
model.tokenizer.text_to_ids(text, language=language_name),
dtype=np.int32,
)
# pass tensors to backend
if speaker_id is not None:
speaker_id = id_to_torch(speaker_id, device=device)
if d_vector is not None:
d_vector = embedding_to_torch(d_vector, device=device)
if language_id is not None:
language_id = id_to_torch(language_id, device=device)
if not isinstance(style_mel, dict):
# GST or Capacitron style mel
style_mel = numpy_to_torch(style_mel, torch.float, device=device)
if style_text is not None:
style_text = np.asarray(
model.tokenizer.text_to_ids(style_text, language=language_id),
dtype=np.int32,
)
style_text = numpy_to_torch(style_text, torch.long, device=device)
style_text = style_text.unsqueeze(0)
text_inputs = numpy_to_torch(text_inputs, torch.long, device=device)
text_inputs = text_inputs.unsqueeze(0)
# synthesize voice
outputs = run_model_torch(
model,
text_inputs,
speaker_id,
style_mel,
style_text,
d_vector=d_vector,
language_id=language_id,
)
model_outputs = outputs["model_outputs"]
model_outputs = model_outputs[0].data.cpu().numpy()
alignments = outputs["alignments"]
# convert outputs to numpy
# plot results
wav = None
model_outputs = model_outputs.squeeze()
if model_outputs.ndim == 2: # [T, C_spec]
if use_griffin_lim:
wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, model.ap)
else: # [T,]
wav = model_outputs
return_dict = {
"wav": wav,
"alignments": alignments,
"text_inputs": text_inputs,
"outputs": outputs,
}
return return_dict
def transfer_voice(
model,
CONFIG,
use_cuda,
reference_wav,
speaker_id=None,
d_vector=None,
reference_speaker_id=None,
reference_d_vector=None,
do_trim_silence=False,
use_griffin_lim=False,
):
"""Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
the vocoder model.
Args:
model (TTS.tts.models):
The TTS model to synthesize audio with.
CONFIG (Coqpit):
Model configuration.
use_cuda (bool):
Enable/disable CUDA.
reference_wav (str):
Path of the reference wav file to be used for voice conversion.
speaker_id (int):
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
d_vector (torch.Tensor):
d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
reference_speaker_id (int):
Reference speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
reference_d_vector (torch.Tensor):
Reference d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
enable_eos_bos_chars (bool):
Enable special characters for the end and beginning of sentences. Defaults to False.
do_trim_silence (bool):
Trim silence after synthesis. Defaults to False.
"""
# device
device = next(model.parameters()).device
if use_cuda:
device = "cuda"
# pass tensors to backend
if speaker_id is not None:
speaker_id = id_to_torch(speaker_id, device=device)
if d_vector is not None:
d_vector = embedding_to_torch(d_vector, device=device)
if reference_d_vector is not None:
reference_d_vector = embedding_to_torch(reference_d_vector, device=device)
# load reference_wav audio
reference_wav = embedding_to_torch(
model.ap.load_wav(
reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
),
device=device,
)
if hasattr(model, "module"):
_func = model.module.inference_voice_conversion
else:
_func = model.inference_voice_conversion
model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
# convert outputs to numpy
# plot results
wav = None
model_outputs = model_outputs.squeeze()
if model_outputs.ndim == 2: # [T, C_spec]
if use_griffin_lim:
wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, model.ap)
else: # [T,]
wav = model_outputs
return wav
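# Usage sketch (illustrative only): `tts_model` and `config` are placeholders for an already
# loaded model from `TTS.tts.models` and its Coqpit config; they are not defined in this module.
#
#     outputs = synthesis(
#         model=tts_model,
#         text="Hello world.",
#         CONFIG=config,
#         use_cuda=False,
#         use_griffin_lim=True,
#         do_trim_silence=True,
#     )
#     wav = outputs["wav"]  # numpy waveform when Griffin-Lim decoding is enabled
#
# `transfer_voice` follows the same pattern, but takes a `reference_wav` path instead of text
# and returns the converted waveform (or output features) directly.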
@@ -0,0 +1 @@
from TTS.tts.utils.text.tokenizer import TTSTokenizer
@@ -0,0 +1,121 @@
import re
import bangla
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer
# initialize
bnorm = Normalizer()
attribution_dict = {
"সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
"আঃ": "আলাইহিস সালাম",
"রাঃ": "রাদিআল্লাহু আনহু",
"রহঃ": "রহমাতুল্লাহি আলাইহি",
"রহিঃ": "রহিমাহুল্লাহ",
"হাফিঃ": "হাফিযাহুল্লাহ",
"বায়ান": "বাইআন",
"দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
# "আয়াত" : "আইআত",#আইআত
# "ওয়া" : "ওআ",
# "ওয়াসাল্লাম" : "ওআসাল্লাম",
# "কেন" : "কেনো",
# "কোন" : "কোনো",
# "বল" : "বলো",
# "চল" : "চলো",
# "কর" : "করো",
# "রাখ" : "রাখো",
"": "",
"": "",
# "য়" : "অ",
# "সম্প্রদায়" : "সম্প্রদাই",
# "রয়েছে" : "রইছে",
# "রয়েছ" : "রইছ",
"/": " বাই ",
}
def tag_text(text: str):
# remove multiple spaces
text = re.sub(" +", " ", text)
# create start and end
text = "start" + text + "end"
# tag text
parts = re.split("[\u0600-\u06FF]+", text)
# remove non chars
parts = [p for p in parts if p.strip()]
# unique parts
parts = set(parts)
# tag the text
for m in parts:
if len(m.strip()) > 1:
text = text.replace(m, f"{m}")
# clean-tags
text = text.replace("start", "")
text = text.replace("end", "")
return text
def normalize(sen):
global bnorm # pylint: disable=global-statement
_words = [bnorm(word)["normalized"] for word in sen.split()]
return " ".join([word for word in _words if word is not None])
def expand_full_attribution(text):
for word, attr in attribution_dict.items():
if word in text:
text = text.replace(word, normalize(attr))
return text
def collapse_whitespace(text):
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
return re.sub(_whitespace_re, " ", text)
def bangla_text_to_phonemes(text: str) -> str:
# english numbers to bangla conversion
res = re.search("[0-9]", text)
if res is not None:
text = bangla.convert_english_digit_to_bangla_digit(text)
# replace ':' in between two bangla numbers with ' এর '
pattern = r"[, ১, ২, ৩, , ৫, ৬, , ৮, ৯]:[, ১, ২, ৩, , ৫, ৬, , ৮, ৯]"
matches = re.findall(pattern, text)
for m in matches:
r = m.replace(":", " এর ")
text = text.replace(m, r)
# numerize text
text = numerize(text)
# tag sections
text = tag_text(text)
# text blocks
# blocks = text.split("")
# blocks = [b for b in blocks if b.strip()]
# create tuple of (lang,text)
if "" in text:
text = text.replace("", "").replace("", "")
# Split based on sentence ending Characters
bn_text = text.strip()
sentenceEnders = re.compile("[।!?]")
sentences = sentenceEnders.split(str(bn_text))
data = ""
for sent in sentences:
res = re.sub("\n", "", sent)
res = normalize(res)
# expand attributes
res = expand_full_attribution(res)
res = collapse_whitespace(res)
res += ""
data += res
return data
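if __name__ == "__main__":
    # Minimal demo (illustrative): the sample sentence is arbitrary and the exact output
    # depends on the bnnumerizer/bnunicodenormalizer versions imported above.
    sample_text = "আমি ২০২৩ সালে ঢাকায় গিয়েছিলাম।"
    print(bangla_text_to_phonemes(sample_text))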
@@ -0,0 +1,37 @@
import os
finder = None
def init():
try:
import jpype
import jpype.imports
except ModuleNotFoundError:
raise ModuleNotFoundError(
"Belarusian phonemizer requires to install module 'jpype1' manually. Try `pip install jpype1`."
)
try:
jar_path = os.environ["BEL_FANETYKA_JAR"]
except KeyError:
raise KeyError("You need to define 'BEL_FANETYKA_JAR' environment variable as path to the fanetyka.jar file")
jpype.startJVM(classpath=[jar_path])
# import the Java modules
from org.alex73.korpus.base import GrammarDB2, GrammarFinder
grammar_db = GrammarDB2.initializeFromJar()
global finder
finder = GrammarFinder(grammar_db)
def belarusian_text_to_phonemes(text: str) -> str:
# Initialize only on first run
if finder is None:
init()
from org.alex73.fanetyka.impl import FanetykaText
return str(FanetykaText(finder, text).ipa)
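if __name__ == "__main__":
    # Minimal demo (illustrative): requires `pip install jpype1` and the BEL_FANETYKA_JAR
    # environment variable pointing to a local fanetyka.jar (the path below is a placeholder).
    #   export BEL_FANETYKA_JAR=/path/to/fanetyka.jar
    print(belarusian_text_to_phonemes("добры дзень"))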
@@ -0,0 +1,501 @@
from dataclasses import replace
from typing import Dict
from TTS.tts.configs.shared_configs import CharactersConfig
def parse_symbols():
return {
"pad": _pad,
"eos": _eos,
"bos": _bos,
"characters": _characters,
"punctuations": _punctuations,
"phonemes": _phonemes,
}
# DEFAULT SET OF GRAPHEMES
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>" # TODO: check if we need this alongside with PAD
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_punctuations = "!'(),-.:;? "
# DEFAULT SET OF IPA PHONEMES
# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
_diacrilics = "ɚ˞ɫ"
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
class BaseVocabulary:
"""Base Vocabulary class.
This class only needs a vocabulary dictionary without specifying the characters.
Args:
vocab (Dict): A dictionary of characters and their corresponding indices.
"""
def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
self.vocab = vocab
self.pad = pad
self.blank = blank
self.bos = bos
self.eos = eos
@property
def pad_id(self) -> int:
"""Return the index of the padding character. If the padding character is not specified, return the length
of the vocabulary."""
return self.char_to_id(self.pad) if self.pad else len(self.vocab)
@property
def blank_id(self) -> int:
"""Return the index of the blank character. If the blank character is not specified, return the length of
the vocabulary."""
return self.char_to_id(self.blank) if self.blank else len(self.vocab)
@property
def bos_id(self) -> int:
"""Return the index of the bos character. If the bos character is not specified, return the length of the
vocabulary."""
return self.char_to_id(self.bos) if self.bos else len(self.vocab)
@property
def eos_id(self) -> int:
"""Return the index of the eos character. If the eos character is not specified, return the length of the
vocabulary."""
return self.char_to_id(self.eos) if self.eos else len(self.vocab)
@property
def vocab(self):
"""Return the vocabulary dictionary."""
return self._vocab
@vocab.setter
def vocab(self, vocab):
"""Set the vocabulary dictionary and character mapping dictionaries."""
self._vocab, self._char_to_id, self._id_to_char = None, None, None
if vocab is not None:
self._vocab = vocab
self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
self._id_to_char = {
idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension
}
@staticmethod
def init_from_config(config, **kwargs):
"""Initialize from the given config."""
if config.characters is not None and "vocab_dict" in config.characters and config.characters.vocab_dict:
return (
BaseVocabulary(
config.characters.vocab_dict,
config.characters.pad,
config.characters.blank,
config.characters.bos,
config.characters.eos,
),
config,
)
return BaseVocabulary(**kwargs), config
def to_config(self) -> "CharactersConfig":
return CharactersConfig(
vocab_dict=self._vocab,
pad=self.pad,
eos=self.eos,
bos=self.bos,
blank=self.blank,
is_unique=False,
is_sorted=False,
)
@property
def num_chars(self):
"""Return number of tokens in the vocabulary."""
return len(self._vocab)
def char_to_id(self, char: str) -> int:
"""Map a character to an token ID."""
try:
return self._char_to_id[char]
except KeyError as e:
raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
def id_to_char(self, idx: int) -> str:
"""Map an token ID to a character."""
return self._id_to_char[idx]
class BaseCharacters:
"""🐸BaseCharacters class
Every new character class should inherit from this.
Characters are ordered as follows ```[PAD, EOS, BOS, BLANK, CHARACTERS, PUNCTUATIONS]```.
If you need a custom order, inherit from this class and override the ```_create_vocab``` method.
Args:
characters (str):
Main set of characters to be used in the vocabulary.
punctuations (str):
Characters to be treated as punctuation.
pad (str):
Special padding character that would be ignored by the model.
eos (str):
End of the sentence character.
bos (str):
Beginning of the sentence character.
blank (str):
Optional character used between characters by some models for better prosody.
is_unique (bool):
Remove duplicates from the provided characters. Defaults to False.
is_sorted (bool):
Sort the characters in alphabetical order. Only applies to `self.characters`. Defaults to True.
"""
def __init__(
self,
characters: str = None,
punctuations: str = None,
pad: str = None,
eos: str = None,
bos: str = None,
blank: str = None,
is_unique: bool = False,
is_sorted: bool = True,
) -> None:
self._characters = characters
self._punctuations = punctuations
self._pad = pad
self._eos = eos
self._bos = bos
self._blank = blank
self.is_unique = is_unique
self.is_sorted = is_sorted
self._create_vocab()
@property
def pad_id(self) -> int:
return self.char_to_id(self.pad) if self.pad else len(self.vocab)
@property
def blank_id(self) -> int:
return self.char_to_id(self.blank) if self.blank else len(self.vocab)
@property
def eos_id(self) -> int:
return self.char_to_id(self.eos) if self.eos else len(self.vocab)
@property
def bos_id(self) -> int:
return self.char_to_id(self.bos) if self.bos else len(self.vocab)
@property
def characters(self):
return self._characters
@characters.setter
def characters(self, characters):
self._characters = characters
self._create_vocab()
@property
def punctuations(self):
return self._punctuations
@punctuations.setter
def punctuations(self, punctuations):
self._punctuations = punctuations
self._create_vocab()
@property
def pad(self):
return self._pad
@pad.setter
def pad(self, pad):
self._pad = pad
self._create_vocab()
@property
def eos(self):
return self._eos
@eos.setter
def eos(self, eos):
self._eos = eos
self._create_vocab()
@property
def bos(self):
return self._bos
@bos.setter
def bos(self, bos):
self._bos = bos
self._create_vocab()
@property
def blank(self):
return self._blank
@blank.setter
def blank(self, blank):
self._blank = blank
self._create_vocab()
@property
def vocab(self):
return self._vocab
@vocab.setter
def vocab(self, vocab):
self._vocab = vocab
self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
self._id_to_char = {
idx: char for idx, char in enumerate(self.vocab) # pylint: disable=unnecessary-comprehension
}
@property
def num_chars(self):
return len(self._vocab)
def _create_vocab(self):
_vocab = self._characters
if self.is_unique:
_vocab = list(set(_vocab))
if self.is_sorted:
_vocab = sorted(_vocab)
_vocab = list(_vocab)
_vocab = [self._blank] + _vocab if self._blank is not None and len(self._blank) > 0 else _vocab
_vocab = [self._bos] + _vocab if self._bos is not None and len(self._bos) > 0 else _vocab
_vocab = [self._eos] + _vocab if self._eos is not None and len(self._eos) > 0 else _vocab
_vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
self.vocab = _vocab + list(self._punctuations)
if self.is_unique:
duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
assert (
len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
), f" [!] There are duplicate characters in the character set. {duplicates}"
def char_to_id(self, char: str) -> int:
try:
return self._char_to_id[char]
except KeyError as e:
raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
def id_to_char(self, idx: int) -> str:
return self._id_to_char[idx]
def print_log(self, level: int = 0):
"""
Prints the vocabulary in a nice format.
"""
indent = "\t" * level
print(f"{indent}| > Characters: {self._characters}")
print(f"{indent}| > Punctuations: {self._punctuations}")
print(f"{indent}| > Pad: {self._pad}")
print(f"{indent}| > EOS: {self._eos}")
print(f"{indent}| > BOS: {self._bos}")
print(f"{indent}| > Blank: {self._blank}")
print(f"{indent}| > Vocab: {self.vocab}")
print(f"{indent}| > Num chars: {self.num_chars}")
@staticmethod
def init_from_config(config: "Coqpit"): # pylint: disable=unused-argument
"""Init your character class from a config.
Implement this method for your subclass.
"""
# use character set from config
if config.characters is not None:
return BaseCharacters(**config.characters), config
# return default character set
characters = BaseCharacters()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
def to_config(self) -> "CharactersConfig":
return CharactersConfig(
characters=self._characters,
punctuations=self._punctuations,
pad=self._pad,
eos=self._eos,
bos=self._bos,
blank=self._blank,
is_unique=self.is_unique,
is_sorted=self.is_sorted,
)
class IPAPhonemes(BaseCharacters):
"""🐸IPAPhonemes class to manage `TTS.tts` model vocabulary
Intended to be used with models using IPAPhonemes as input.
It uses system defaults for the undefined class arguments.
Args:
characters (str):
Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_phonemes`.
punctuations (str):
Characters to be treated as punctuation. Defaults to `_punctuations`.
pad (str):
Special padding character that would be ignored by the model. Defaults to `_pad`.
eos (str):
End of the sentence character. Defaults to `_eos`.
bos (str):
Beginning of the sentence character. Defaults to `_bos`.
blank (str):
Optional character used between characters by some models for better prosody. Defaults to `_blank`.
is_unique (bool):
Remove duplicates from the provided characters. Defaults to False.
is_sorted (bool):
Sort the characters in alphabetical order. Defaults to True.
"""
def __init__(
self,
characters: str = _phonemes,
punctuations: str = _punctuations,
pad: str = _pad,
eos: str = _eos,
bos: str = _bos,
blank: str = _blank,
is_unique: bool = False,
is_sorted: bool = True,
) -> None:
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
@staticmethod
def init_from_config(config: "Coqpit"):
"""Init a IPAPhonemes object from a model config
If characters are not defined in the config, it will be set to the default characters and the config
will be updated.
"""
# band-aid for compatibility with old models
if "characters" in config and config.characters is not None:
if "phonemes" in config.characters and config.characters.phonemes is not None:
config.characters["characters"] = config.characters["phonemes"]
return (
IPAPhonemes(
characters=config.characters["characters"],
punctuations=config.characters["punctuations"],
pad=config.characters["pad"],
eos=config.characters["eos"],
bos=config.characters["bos"],
blank=config.characters["blank"],
is_unique=config.characters["is_unique"],
is_sorted=config.characters["is_sorted"],
),
config,
)
# use character set from config
if config.characters is not None:
return IPAPhonemes(**config.characters), config
# return default character set
characters = IPAPhonemes()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
class Graphemes(BaseCharacters):
"""🐸Graphemes class to manage `TTS.tts` model vocabulary
Intended to be used with models using graphemes as input.
It uses system defaults for the undefined class arguments.
Args:
characters (str):
Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_characters`.
punctuations (str):
Characters to be treated as punctuation. Defaults to `_punctuations`.
pad (str):
Special padding character that would be ignored by the model. Defaults to `_pad`.
eos (str):
End of the sentence character. Defaults to `_eos`.
bos (str):
Beginning of the sentence character. Defaults to `_bos`.
is_unique (bool):
Remove duplicates from the provided characters. Defaults to False.
is_sorted (bool):
Sort the characters in alphabetical order. Defaults to True.
"""
def __init__(
self,
characters: str = _characters,
punctuations: str = _punctuations,
pad: str = _pad,
eos: str = _eos,
bos: str = _bos,
blank: str = _blank,
is_unique: bool = False,
is_sorted: bool = True,
) -> None:
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
@staticmethod
def init_from_config(config: "Coqpit"):
"""Init a Graphemes object from a model config
If characters are not defined in the config, it will be set to the default characters and the config
will be updated.
"""
if config.characters is not None:
# band-aid for compatibility with old models
if "phonemes" in config.characters:
return (
Graphemes(
characters=config.characters["characters"],
punctuations=config.characters["punctuations"],
pad=config.characters["pad"],
eos=config.characters["eos"],
bos=config.characters["bos"],
blank=config.characters["blank"],
is_unique=config.characters["is_unique"],
is_sorted=config.characters["is_sorted"],
),
config,
)
return Graphemes(**config.characters), config
characters = Graphemes()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
if __name__ == "__main__":
gr = Graphemes()
ph = IPAPhonemes()
gr.print_log()
ph.print_log()
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.
import itertools
import re
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers ( -> 九)
Args:
num (str): arabic number to convert
big (bool, optional): use financial characters. Defaults to False.
simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True.
o (bool, optional): use for 'zero'. Defaults to False.
twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
Raises:
ValueError: if number is more than 1e48
ValueError: if 'e' exposent in number
Returns:
str: converted number as hanzi characters
"""
# check num first
nd = str(num)
if abs(float(nd)) >= 1e48:
raise ValueError("number out of range")
if "e" in nd:
raise ValueError("scientific notation is not supported")
c_symbol = "正负点" if simp else "正負點"
if o: # formal
twoalt = False
if big:
c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
c_unit1 = "拾佰仟"
c_twoalt = "" if simp else ""
else:
c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
c_unit1 = "十百千"
if twoalt:
c_twoalt = "" if simp else ""
else:
c_twoalt = ""
c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l)))
nd = str(num)
result = []
if nd[0] == "+":
result.append(c_symbol[0])
elif nd[0] == "-":
result.append(c_symbol[1])
if "." in nd:
integer, remainder = nd.lstrip("+-").split(".")
else:
integer, remainder = nd.lstrip("+-"), None
if int(integer):
splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
intresult = []
for nu, unit in enumerate(splitted):
# special cases
if int(unit) == 0: # 0000
intresult.append(c_basic[0])
continue
if nu > 0 and int(unit) == 2: # 0002
intresult.append(c_twoalt + c_unit2[nu - 1])
continue
ulist = []
unit = unit.zfill(4)
for nc, ch in enumerate(reversed(unit)):
if ch == "0":
if ulist: # ???0
ulist.append(c_basic[0])
elif nc == 0:
ulist.append(c_basic[int(ch)])
elif nc == 1 and ch == "1" and unit[1] == "0":
# special case for tens
# edit the 'elif' if you don't like
# 十四, 三千零十四, 三千三百一十四
ulist.append(c_unit1[0])
elif nc > 1 and ch == "2":
ulist.append(c_twoalt + c_unit1[nc - 1])
else:
ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
ustr = revuniq(ulist)
if nu == 0:
intresult.append(ustr)
else:
intresult.append(ustr + c_unit2[nu - 1])
result.append(revuniq(intresult).strip(c_basic[0]))
else:
result.append(c_basic[0])
if remainder:
result.append(c_symbol[2])
result.append("".join(c_basic[int(ch)] for ch in remainder))
return "".join(result)
def _number_replace(match) -> str:
"""function to apply in a match, transform all numbers in a match by chinese characters
Args:
match (re.Match): numbers regex matches
Returns:
str: replaced characters for the numbers
"""
match_str: str = match.group()
return _num2chinese(match_str)
def replace_numbers_to_characters_in_text(text: str) -> str:
"""Replace all arabic numbers in a text by their equivalent in chinese characters (simplified)
Args:
text (str): input text to transform
Returns:
str: output text
"""
text = re.sub(r"[0-9]+", _number_replace, text)
return text
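if __name__ == "__main__":
    # Minimal demo (illustrative): Arabic digits are rewritten as simplified hanzi.
    print(_num2chinese("123"))  # expected: 一百二十三
    print(replace_numbers_to_characters_in_text("我有2个苹果"))  # expected: 我有二个苹果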
@@ -0,0 +1,37 @@
from typing import List
import jieba
import pypinyin
from .pinyinToPhonemes import PINYIN_DICT
def _chinese_character_to_pinyin(text: str) -> List[str]:
pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
pinyins_flat_list = [item for sublist in pinyins for item in sublist]
return pinyins_flat_list
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
segment = pinyin[:-1]
tone = pinyin[-1]
phoneme = PINYIN_DICT.get(segment, [""])[0]
return phoneme + tone
def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
tokenized_text = jieba.cut(text, HMM=False)
tokenized_text = " ".join(tokenized_text)
pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
results: List[str] = []
for token in pinyined_text:
if token[-1] in "12345": # TODO transform to is_pinyin()
pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
results += list(pinyin_phonemes)
else:  # is punctuation or other
results += list(token)
return seperator.join(results)
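if __name__ == "__main__":
    # Minimal demo (illustrative): requires the jieba and pypinyin packages imported above.
    # For "你好" this yields a tone-numbered phoneme string along the lines of "n|i|3|x|a|ʌ|3".
    print(chinese_text_to_phonemes("你好"))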
@@ -0,0 +1,419 @@
PINYIN_DICT = {
"a": ["a"],
"ai": ["ai"],
"an": ["an"],
"ang": ["ɑŋ"],
"ao": [""],
"ba": ["ba"],
"bai": ["bai"],
"ban": ["ban"],
"bang": ["bɑŋ"],
"bao": ["baʌ"],
# "be": ["be"], doesnt exist
"bei": ["bɛi"],
"ben": ["bœn"],
"beng": ["bɵŋ"],
"bi": ["bi"],
"bian": ["biɛn"],
"biao": ["biaʌ"],
"bie": ["bie"],
"bin": ["bin"],
"bing": ["bɨŋ"],
"bo": ["bo"],
"bu": ["bu"],
"ca": ["tsa"],
"cai": ["tsai"],
"can": ["tsan"],
"cang": ["tsɑŋ"],
"cao": ["tsaʌ"],
"ce": ["tsø"],
"cen": ["tsœn"],
"ceng": ["tsɵŋ"],
"cha": ["ʈʂa"],
"chai": ["ʈʂai"],
"chan": ["ʈʂan"],
"chang": ["ʈʂɑŋ"],
"chao": ["ʈʂaʌ"],
"che": ["ʈʂø"],
"chen": ["ʈʂœn"],
"cheng": ["ʈʂɵŋ"],
"chi": ["ʈʂʏ"],
"chong": ["ʈʂoŋ"],
"chou": ["ʈʂou"],
"chu": ["ʈʂu"],
"chua": ["ʈʂua"],
"chuai": ["ʈʂuai"],
"chuan": ["ʈʂuan"],
"chuang": ["ʈʂuɑŋ"],
"chui": ["ʈʂuei"],
"chun": ["ʈʂun"],
"chuo": ["ʈʂuo"],
"ci": ["tsɪ"],
"cong": ["tsoŋ"],
"cou": ["tsou"],
"cu": ["tsu"],
"cuan": ["tsuan"],
"cui": ["tsuei"],
"cun": ["tsun"],
"cuo": ["tsuo"],
"da": ["da"],
"dai": ["dai"],
"dan": ["dan"],
"dang": ["dɑŋ"],
"dao": ["daʌ"],
"de": [""],
"dei": ["dei"],
# "den": ["dœn"],
"deng": ["dɵŋ"],
"di": ["di"],
"dia": ["dia"],
"dian": ["diɛn"],
"diao": ["diaʌ"],
"die": ["die"],
"ding": ["dɨŋ"],
"diu": ["dio"],
"dong": ["doŋ"],
"dou": ["dou"],
"du": ["du"],
"duan": ["duan"],
"dui": ["duei"],
"dun": ["dun"],
"duo": ["duo"],
"e": ["ø"],
"ei": ["ei"],
"en": ["œn"],
# "ng": ["œn"],
# "eng": ["ɵŋ"],
"er": ["er"],
"fa": ["fa"],
"fan": ["fan"],
"fang": ["fɑŋ"],
"fei": ["fei"],
"fen": ["fœn"],
"feng": ["fɵŋ"],
"fo": ["fo"],
"fou": ["fou"],
"fu": ["fu"],
"ga": ["ga"],
"gai": ["gai"],
"gan": ["gan"],
"gang": ["gɑŋ"],
"gao": ["gaʌ"],
"ge": [""],
"gei": ["gei"],
"gen": ["gœn"],
"geng": ["gɵŋ"],
"gong": ["goŋ"],
"gou": ["gou"],
"gu": ["gu"],
"gua": ["gua"],
"guai": ["guai"],
"guan": ["guan"],
"guang": ["guɑŋ"],
"gui": ["guei"],
"gun": ["gun"],
"guo": ["guo"],
"ha": ["xa"],
"hai": ["xai"],
"han": ["xan"],
"hang": ["xɑŋ"],
"hao": ["xaʌ"],
"he": [""],
"hei": ["xei"],
"hen": ["xœn"],
"heng": ["xɵŋ"],
"hong": ["xoŋ"],
"hou": ["xou"],
"hu": ["xu"],
"hua": ["xua"],
"huai": ["xuai"],
"huan": ["xuan"],
"huang": ["xuɑŋ"],
"hui": ["xuei"],
"hun": ["xun"],
"huo": ["xuo"],
"ji": ["dʑi"],
"jia": ["dʑia"],
"jian": ["dʑiɛn"],
"jiang": ["dʑiɑŋ"],
"jiao": ["dʑiaʌ"],
"jie": ["dʑie"],
"jin": ["dʑin"],
"jing": ["dʑɨŋ"],
"jiong": ["dʑioŋ"],
"jiu": ["dʑio"],
"ju": ["dʑy"],
"juan": ["dʑyɛn"],
"jue": ["dʑye"],
"jun": ["dʑyn"],
"ka": ["ka"],
"kai": ["kai"],
"kan": ["kan"],
"kang": ["kɑŋ"],
"kao": ["kaʌ"],
"ke": [""],
"kei": ["kei"],
"ken": ["kœn"],
"keng": ["kɵŋ"],
"kong": ["koŋ"],
"kou": ["kou"],
"ku": ["ku"],
"kua": ["kua"],
"kuai": ["kuai"],
"kuan": ["kuan"],
"kuang": ["kuɑŋ"],
"kui": ["kuei"],
"kun": ["kun"],
"kuo": ["kuo"],
"la": ["la"],
"lai": ["lai"],
"lan": ["lan"],
"lang": ["lɑŋ"],
"lao": ["laʌ"],
"le": [""],
"lei": ["lei"],
"leng": ["lɵŋ"],
"li": ["li"],
"lia": ["lia"],
"lian": ["liɛn"],
"liang": ["liɑŋ"],
"liao": ["liaʌ"],
"lie": ["lie"],
"lin": ["lin"],
"ling": ["lɨŋ"],
"liu": ["lio"],
"lo": ["lo"],
"long": ["loŋ"],
"lou": ["lou"],
"lu": ["lu"],
"lv": ["ly"],
"luan": ["luan"],
"lve": ["lye"],
"lue": ["lue"],
"lun": ["lun"],
"luo": ["luo"],
"ma": ["ma"],
"mai": ["mai"],
"man": ["man"],
"mang": ["mɑŋ"],
"mao": ["maʌ"],
"me": [""],
"mei": ["mei"],
"men": ["mœn"],
"meng": ["mɵŋ"],
"mi": ["mi"],
"mian": ["miɛn"],
"miao": ["miaʌ"],
"mie": ["mie"],
"min": ["min"],
"ming": ["mɨŋ"],
"miu": ["mio"],
"mo": ["mo"],
"mou": ["mou"],
"mu": ["mu"],
"na": ["na"],
"nai": ["nai"],
"nan": ["nan"],
"nang": ["nɑŋ"],
"nao": ["naʌ"],
"ne": [""],
"nei": ["nei"],
"nen": ["nœn"],
"neng": ["nɵŋ"],
"ni": ["ni"],
"nia": ["nia"],
"nian": ["niɛn"],
"niang": ["niɑŋ"],
"niao": ["niaʌ"],
"nie": ["nie"],
"nin": ["nin"],
"ning": ["nɨŋ"],
"niu": ["nio"],
"nong": ["noŋ"],
"nou": ["nou"],
"nu": ["nu"],
"nv": ["ny"],
"nuan": ["nuan"],
"nve": ["nye"],
"nue": ["nye"],
"nuo": ["nuo"],
"o": ["o"],
"ou": ["ou"],
"pa": ["pa"],
"pai": ["pai"],
"pan": ["pan"],
"pang": ["pɑŋ"],
"pao": ["paʌ"],
"pe": [""],
"pei": ["pei"],
"pen": ["pœn"],
"peng": ["pɵŋ"],
"pi": ["pi"],
"pian": ["piɛn"],
"piao": ["piaʌ"],
"pie": ["pie"],
"pin": ["pin"],
"ping": ["pɨŋ"],
"po": ["po"],
"pou": ["pou"],
"pu": ["pu"],
"qi": ["tɕi"],
"qia": ["tɕia"],
"qian": ["tɕiɛn"],
"qiang": ["tɕiɑŋ"],
"qiao": ["tɕiaʌ"],
"qie": ["tɕie"],
"qin": ["tɕin"],
"qing": ["tɕɨŋ"],
"qiong": ["tɕioŋ"],
"qiu": ["tɕio"],
"qu": ["tɕy"],
"quan": ["tɕyɛn"],
"que": ["tɕye"],
"qun": ["tɕyn"],
"ran": ["ʐan"],
"rang": ["ʐɑŋ"],
"rao": ["ʐaʌ"],
"re": ["ʐø"],
"ren": ["ʐœn"],
"reng": ["ʐɵŋ"],
"ri": ["ʐʏ"],
"rong": ["ʐoŋ"],
"rou": ["ʐou"],
"ru": ["ʐu"],
"rua": ["ʐua"],
"ruan": ["ʐuan"],
"rui": ["ʐuei"],
"run": ["ʐun"],
"ruo": ["ʐuo"],
"sa": ["sa"],
"sai": ["sai"],
"san": ["san"],
"sang": ["sɑŋ"],
"sao": ["saʌ"],
"se": [""],
"sen": ["sœn"],
"seng": ["sɵŋ"],
"sha": ["ʂa"],
"shai": ["ʂai"],
"shan": ["ʂan"],
"shang": ["ʂɑŋ"],
"shao": ["ʂaʌ"],
"she": ["ʂø"],
"shei": ["ʂei"],
"shen": ["ʂœn"],
"sheng": ["ʂɵŋ"],
"shi": ["ʂʏ"],
"shou": ["ʂou"],
"shu": ["ʂu"],
"shua": ["ʂua"],
"shuai": ["ʂuai"],
"shuan": ["ʂuan"],
"shuang": ["ʂuɑŋ"],
"shui": ["ʂuei"],
"shun": ["ʂun"],
"shuo": ["ʂuo"],
"si": ["sɪ"],
"song": ["soŋ"],
"sou": ["sou"],
"su": ["su"],
"suan": ["suan"],
"sui": ["suei"],
"sun": ["sun"],
"suo": ["suo"],
"ta": ["ta"],
"tai": ["tai"],
"tan": ["tan"],
"tang": ["tɑŋ"],
"tao": ["taʌ"],
"te": [""],
"tei": ["tei"],
"teng": ["tɵŋ"],
"ti": ["ti"],
"tian": ["tiɛn"],
"tiao": ["tiaʌ"],
"tie": ["tie"],
"ting": ["tɨŋ"],
"tong": ["toŋ"],
"tou": ["tou"],
"tu": ["tu"],
"tuan": ["tuan"],
"tui": ["tuei"],
"tun": ["tun"],
"tuo": ["tuo"],
"wa": ["wa"],
"wai": ["wai"],
"wan": ["wan"],
"wang": ["wɑŋ"],
"wei": ["wei"],
"wen": ["wœn"],
"weng": ["wɵŋ"],
"wo": ["wo"],
"wu": ["wu"],
"xi": ["ɕi"],
"xia": ["ɕia"],
"xian": ["ɕiɛn"],
"xiang": ["ɕiɑŋ"],
"xiao": ["ɕiaʌ"],
"xie": ["ɕie"],
"xin": ["ɕin"],
"xing": ["ɕɨŋ"],
"xiong": ["ɕioŋ"],
"xiu": ["ɕio"],
"xu": ["ɕy"],
"xuan": ["ɕyɛn"],
"xue": ["ɕye"],
"xun": ["ɕyn"],
"ya": ["ia"],
"yan": ["iɛn"],
"yang": ["iɑŋ"],
"yao": ["iaʌ"],
"ye": ["ie"],
"yi": ["i"],
"yin": ["in"],
"ying": ["ɨŋ"],
"yo": ["io"],
"yong": ["ioŋ"],
"you": ["io"],
"yu": ["y"],
"yuan": ["yɛn"],
"yue": ["ye"],
"yun": ["yn"],
"za": ["dza"],
"zai": ["dzai"],
"zan": ["dzan"],
"zang": ["dzɑŋ"],
"zao": ["dzaʌ"],
"ze": ["dzø"],
"zei": ["dzei"],
"zen": ["dzœn"],
"zeng": ["dzɵŋ"],
"zha": ["dʒa"],
"zhai": ["dʒai"],
"zhan": ["dʒan"],
"zhang": ["dʒɑŋ"],
"zhao": ["dʒaʌ"],
"zhe": ["dʒø"],
# "zhei": ["dʒei"], it doesn't exist
"zhen": ["dʒœn"],
"zheng": ["dʒɵŋ"],
"zhi": ["dʒʏ"],
"zhong": ["dʒoŋ"],
"zhou": ["dʒou"],
"zhu": ["dʒu"],
"zhua": ["dʒua"],
"zhuai": ["dʒuai"],
"zhuan": ["dʒuan"],
"zhuang": ["dʒuɑŋ"],
"zhui": ["dʒuei"],
"zhun": ["dʒun"],
"zhuo": ["dʒuo"],
"zi": ["dzɪ"],
"zong": ["dzoŋ"],
"zou": ["dzou"],
"zu": ["dzu"],
"zuan": ["dzuan"],
"zui": ["dzuei"],
"zun": ["dzun"],
"zuo": ["dzuo"],
}
@@ -0,0 +1,171 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re
from anyascii import anyascii
from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
from .english.abbreviations import abbreviations_en
from .english.number_norm import normalize_numbers as en_normalize_numbers
from .english.time_norm import expand_time_english
from .french.abbreviations import abbreviations_fr
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
def expand_abbreviations(text, lang="en"):
if lang == "en":
_abbreviations = abbreviations_en
elif lang == "fr":
_abbreviations = abbreviations_fr
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, " ", text).strip()
def convert_to_ascii(text):
return anyascii(text)
def remove_aux_symbols(text):
text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
return text
def replace_symbols(text, lang="en"):
"""Replace symbols based on the lenguage tag.
Args:
text:
Input text.
lang:
Language identifier. ex: "en", "fr", "pt", "ca".
Returns:
The modified text
example:
input args:
text: "si l'avi cau, diguem-ho"
lang: "ca"
Output:
text: "si lavi cau, diguemho"
"""
text = text.replace(";", ",")
text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
text = text.replace(":", ",")
if lang == "en":
text = text.replace("&", " and ")
elif lang == "fr":
text = text.replace("&", " et ")
elif lang == "pt":
text = text.replace("&", " e ")
elif lang == "ca":
text = text.replace("&", " i ")
text = text.replace("'", "")
return text
def basic_cleaners(text):
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
"""Pipeline for non-English text that transliterates to ASCII."""
# text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
def basic_german_cleaners(text):
"""Pipeline for German text"""
text = lowercase(text)
text = collapse_whitespace(text)
return text
# TODO: elaborate it
def basic_turkish_cleaners(text):
"""Pipeline for Turkish text"""
text = text.replace("I", "ı")
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text):
"""Pipeline for English text, including number and abbreviation expansion."""
# text = convert_to_ascii(text)
text = lowercase(text)
text = expand_time_english(text)
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def phoneme_cleaners(text):
"""Pipeline for phonemes mode, including number and abbreviation expansion."""
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def french_cleaners(text):
"""Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
text = expand_abbreviations(text, lang="fr")
text = lowercase(text)
text = replace_symbols(text, lang="fr")
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def portuguese_cleaners(text):
"""Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that"""
text = lowercase(text)
text = replace_symbols(text, lang="pt")
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def chinese_mandarin_cleaners(text: str) -> str:
"""Basic pipeline for chinese"""
text = replace_numbers_to_characters_in_text(text)
return text
def multilingual_cleaners(text):
"""Pipeline for multilingual text"""
text = lowercase(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def no_cleaners(text):
# remove newline characters
text = text.replace("\n", "")
return text
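if __name__ == "__main__":
    # Minimal demo (illustrative) of the English pipeline: times, numbers and abbreviations
    # are expanded before whitespace is collapsed.
    # Expected output: "doctor smith bought two apples at ten thirty a m."
    print(english_cleaners("Dr. Smith bought 2 apples at 10:30 am."))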
@@ -0,0 +1,151 @@
# -*- coding: utf-8 -*-
import re
VALID_SYMBOLS = [
"AA",
"AA0",
"AA1",
"AA2",
"AE",
"AE0",
"AE1",
"AE2",
"AH",
"AH0",
"AH1",
"AH2",
"AO",
"AO0",
"AO1",
"AO2",
"AW",
"AW0",
"AW1",
"AW2",
"AY",
"AY0",
"AY1",
"AY2",
"B",
"CH",
"D",
"DH",
"EH",
"EH0",
"EH1",
"EH2",
"ER",
"ER0",
"ER1",
"ER2",
"EY",
"EY0",
"EY1",
"EY2",
"F",
"G",
"HH",
"IH",
"IH0",
"IH1",
"IH2",
"IY",
"IY0",
"IY1",
"IY2",
"JH",
"K",
"L",
"M",
"N",
"NG",
"OW",
"OW0",
"OW1",
"OW2",
"OY",
"OY0",
"OY1",
"OY2",
"P",
"R",
"S",
"SH",
"T",
"TH",
"UH",
"UH0",
"UH1",
"UH2",
"UW",
"UW0",
"UW1",
"UW2",
"V",
"W",
"Y",
"Z",
"ZH",
]
class CMUDict:
"""Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding="latin-1") as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
self._entries = entries
def __len__(self):
return len(self._entries)
def lookup(self, word):
"""Returns list of ARPAbet pronunciations of the given word."""
return self._entries.get(word.upper())
@staticmethod
def get_arpabet(word, cmudict, punctuation_symbols):
first_symbol, last_symbol = "", ""
if word and word[0] in punctuation_symbols:
first_symbol = word[0]
word = word[1:]
if word and word[-1] in punctuation_symbols:
last_symbol = word[-1]
word = word[:-1]
arpabet = cmudict.lookup(word)
if arpabet is not None:
return first_symbol + "{%s}" % arpabet[0] + last_symbol
return first_symbol + word + last_symbol
_alt_re = re.compile(r"\([0-9]+\)")
def _parse_cmudict(file):
cmudict = {}
for line in file:
if line and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
parts = line.split("  ")  # word and pronunciation are separated by two spaces in CMUDict
word = re.sub(_alt_re, "", parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict
def _get_pronunciation(s):
parts = s.strip().split(" ")
for part in parts:
if part not in VALID_SYMBOLS:
return None
return " ".join(parts)
@@ -0,0 +1,26 @@
import re
# List of (regular expression, replacement) pairs for abbreviations in english:
abbreviations_en = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
@@ -0,0 +1,97 @@
""" from https://github.com/keithito/tacotron """
import re
from typing import Dict
import inflect
_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"-?[0-9]+")
def _remove_commas(m):
return m.group(1).replace(",", "")
def _expand_decimal_point(m):
return m.group(1).replace(".", " point ")
def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
parts = value.replace(",", "").split(".")
if len(parts) > 2:
return f"{value} {inflection[2]}" # Unexpected format
text = []
integer = int(parts[0]) if parts[0] else 0
if integer > 0:
integer_unit = inflection.get(integer, inflection[2])
text.append(f"{integer} {integer_unit}")
fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if fraction > 0:
fraction_unit = inflection.get(fraction / 100, inflection[0.02])
text.append(f"{fraction} {fraction_unit}")
if len(text) == 0:
return f"zero {inflection[2]}"
return " ".join(text)
def _expand_currency(m: "re.Match") -> str:
currencies = {
"$": {
0.01: "cent",
0.02: "cents",
1: "dollar",
2: "dollars",
},
"": {
0.01: "cent",
0.02: "cents",
1: "euro",
2: "euros",
},
"£": {
0.01: "penny",
0.02: "pence",
1: "pound sterling",
2: "pounds sterling",
},
"¥": {
# TODO rin
0.02: "sen",
2: "yen",
},
}
unit = m.group(1)
currency = currencies[unit]
value = m.group(2)
return __expand_currency(value, currency)
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if 1000 < num < 3000:
if num == 2000:
return "two thousand"
if 2000 < num < 2010:
return "two thousand " + _inflect.number_to_words(num % 100)
if num % 100 == 0:
return _inflect.number_to_words(num // 100) + " hundred"
return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
return _inflect.number_to_words(num, andword="")
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_currency_re, _expand_currency, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
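if __name__ == "__main__":
    # Minimal demo (illustrative): currency, decimal and plain numbers are verbalized in turn.
    # Expected output: "I paid three dollars fifty cents for two thousand apples"
    print(normalize_numbers("I paid $3.50 for 2000 apples"))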
@@ -0,0 +1,47 @@
import re
import inflect
_inflect = inflect.engine()
_time_re = re.compile(
r"""\b
((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
:
([0-5][0-9]) # minutes
\s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
\b""",
re.IGNORECASE | re.X,
)
def _expand_num(n: int) -> str:
return _inflect.number_to_words(n)
def _expand_time_english(match: "re.Match") -> str:
hour = int(match.group(1))
past_noon = hour >= 12
time = []
if hour > 12:
hour -= 12
elif hour == 0:
hour = 12
past_noon = True
time.append(_expand_num(hour))
minute = int(match.group(6))
if minute > 0:
if minute < 10:
time.append("oh")
time.append(_expand_num(minute))
am_pm = match.group(7)
if am_pm is None:
time.append("p m" if past_noon else "a m")
else:
time.extend(list(am_pm.replace(".", "")))
return " ".join(time)
def expand_time_english(text: str) -> str:
return re.sub(_time_re, _expand_time_english, text)
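if __name__ == "__main__":
    # Minimal demo (illustrative).
    # Expected output: "The train leaves at three thirty p m"
    print(expand_time_english("The train leaves at 3:30 pm"))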
@@ -0,0 +1,48 @@
import re
# List of (regular expression, replacement) pairs for abbreviations in french:
abbreviations_fr = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("M", "monsieur"),
("Mlle", "mademoiselle"),
("Mlles", "mesdemoiselles"),
("Mme", "Madame"),
("Mmes", "Mesdames"),
("N.B", "nota bene"),
("M", "monsieur"),
("p.c.q", "parce que"),
("Pr", "professeur"),
("qqch", "quelque chose"),
("rdv", "rendez-vous"),
("max", "maximum"),
("min", "minimum"),
("no", "numéro"),
("adr", "adresse"),
("dr", "docteur"),
("st", "saint"),
("co", "companie"),
("jr", "junior"),
("sgt", "sergent"),
("capt", "capitain"),
("col", "colonel"),
("av", "avenue"),
("av. J.-C", "avant Jésus-Christ"),
("apr. J.-C", "après Jésus-Christ"),
("art", "article"),
("boul", "boulevard"),
("c.-à-d", "cest-à-dire"),
("etc", "et cetera"),
("ex", "exemple"),
("excl", "exclusivement"),
("boul", "boulevard"),
]
] + [
(re.compile("\\b%s" % x[0]), x[1])
for x in [
("Mlle", "mademoiselle"),
("Mlles", "mesdemoiselles"),
("Mme", "Madame"),
("Mmes", "Mesdames"),
]
]
@@ -0,0 +1,470 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
try:
import MeCab
except ImportError as e:
raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
from num2words import num2words
_CONVRULES = [
# Conversion of 2 letters
"アァ/ a a",
"イィ/ i i",
"イェ/ i e",
"イャ/ y a",
"ウゥ/ u:",
"エェ/ e e",
"オォ/ o:",
"カァ/ k a:",
"キィ/ k i:",
"クゥ/ k u:",
"クャ/ ky a",
"クュ/ ky u",
"クョ/ ky o",
"ケェ/ k e:",
"コォ/ k o:",
"ガァ/ g a:",
"ギィ/ g i:",
"グゥ/ g u:",
"グャ/ gy a",
"グュ/ gy u",
"グョ/ gy o",
"ゲェ/ g e:",
"ゴォ/ g o:",
"サァ/ s a:",
"シィ/ sh i:",
"スゥ/ s u:",
"スャ/ sh a",
"スュ/ sh u",
"スョ/ sh o",
"セェ/ s e:",
"ソォ/ s o:",
"ザァ/ z a:",
"ジィ/ j i:",
"ズゥ/ z u:",
"ズャ/ zy a",
"ズュ/ zy u",
"ズョ/ zy o",
"ゼェ/ z e:",
"ゾォ/ z o:",
"タァ/ t a:",
"チィ/ ch i:",
"ツァ/ ts a",
"ツィ/ ts i",
"ツゥ/ ts u:",
"ツャ/ ch a",
"ツュ/ ch u",
"ツョ/ ch o",
"ツェ/ ts e",
"ツォ/ ts o",
"テェ/ t e:",
"トォ/ t o:",
"ダァ/ d a:",
"ヂィ/ j i:",
"ヅゥ/ d u:",
"ヅャ/ zy a",
"ヅュ/ zy u",
"ヅョ/ zy o",
"デェ/ d e:",
"ドォ/ d o:",
"ナァ/ n a:",
"ニィ/ n i:",
"ヌゥ/ n u:",
"ヌャ/ ny a",
"ヌュ/ ny u",
"ヌョ/ ny o",
"ネェ/ n e:",
"ノォ/ n o:",
"ハァ/ h a:",
"ヒィ/ h i:",
"フゥ/ f u:",
"フャ/ hy a",
"フュ/ hy u",
"フョ/ hy o",
"ヘェ/ h e:",
"ホォ/ h o:",
"バァ/ b a:",
"ビィ/ b i:",
"ブゥ/ b u:",
"フャ/ hy a",
"ブュ/ by u",
"フョ/ hy o",
"ベェ/ b e:",
"ボォ/ b o:",
"パァ/ p a:",
"ピィ/ p i:",
"プゥ/ p u:",
"プャ/ py a",
"プュ/ py u",
"プョ/ py o",
"ペェ/ p e:",
"ポォ/ p o:",
"マァ/ m a:",
"ミィ/ m i:",
"ムゥ/ m u:",
"ムャ/ my a",
"ムュ/ my u",
"ムョ/ my o",
"メェ/ m e:",
"モォ/ m o:",
"ヤァ/ y a:",
"ユゥ/ y u:",
"ユャ/ y a:",
"ユュ/ y u:",
"ユョ/ y o:",
"ヨォ/ y o:",
"ラァ/ r a:",
"リィ/ r i:",
"ルゥ/ r u:",
"ルャ/ ry a",
"ルュ/ ry u",
"ルョ/ ry o",
"レェ/ r e:",
"ロォ/ r o:",
"ワァ/ w a:",
"ヲォ/ o:",
"ディ/ d i",
"デェ/ d e:",
"デャ/ dy a",
"デュ/ dy u",
"デョ/ dy o",
"ティ/ t i",
"テェ/ t e:",
"テャ/ ty a",
"テュ/ ty u",
"テョ/ ty o",
"スィ/ s i",
"ズァ/ z u a",
"ズィ/ z i",
"ズゥ/ z u",
"ズャ/ zy a",
"ズュ/ zy u",
"ズョ/ zy o",
"ズェ/ z e",
"ズォ/ z o",
"キャ/ ky a",
"キュ/ ky u",
"キョ/ ky o",
"シャ/ sh a",
"シュ/ sh u",
"シェ/ sh e",
"ショ/ sh o",
"チャ/ ch a",
"チュ/ ch u",
"チェ/ ch e",
"チョ/ ch o",
"トゥ/ t u",
"トャ/ ty a",
"トュ/ ty u",
"トョ/ ty o",
"ドァ/ d o a",
"ドゥ/ d u",
"ドャ/ dy a",
"ドュ/ dy u",
"ドョ/ dy o",
"ドォ/ d o:",
"ニャ/ ny a",
"ニュ/ ny u",
"ニョ/ ny o",
"ヒャ/ hy a",
"ヒュ/ hy u",
"ヒョ/ hy o",
"ミャ/ my a",
"ミュ/ my u",
"ミョ/ my o",
"リャ/ ry a",
"リュ/ ry u",
"リョ/ ry o",
"ギャ/ gy a",
"ギュ/ gy u",
"ギョ/ gy o",
"ヂェ/ j e",
"ヂャ/ j a",
"ヂュ/ j u",
"ヂョ/ j o",
"ジェ/ j e",
"ジャ/ j a",
"ジュ/ j u",
"ジョ/ j o",
"ビャ/ by a",
"ビュ/ by u",
"ビョ/ by o",
"ピャ/ py a",
"ピュ/ py u",
"ピョ/ py o",
"ウァ/ u a",
"ウィ/ w i",
"ウェ/ w e",
"ウォ/ w o",
"ファ/ f a",
"フィ/ f i",
"フゥ/ f u",
"フャ/ hy a",
"フュ/ hy u",
"フョ/ hy o",
"フェ/ f e",
"フォ/ f o",
"ヴァ/ b a",
"ヴィ/ b i",
"ヴェ/ b e",
"ヴォ/ b o",
"ヴュ/ by u",
# Conversion of 1 letter
"ア/ a",
"イ/ i",
"ウ/ u",
"エ/ e",
"オ/ o",
"カ/ k a",
"キ/ k i",
"ク/ k u",
"ケ/ k e",
"コ/ k o",
"サ/ s a",
"シ/ sh i",
"ス/ s u",
"セ/ s e",
"ソ/ s o",
"タ/ t a",
"チ/ ch i",
"ツ/ ts u",
"テ/ t e",
"ト/ t o",
"ナ/ n a",
"ニ/ n i",
"ヌ/ n u",
"ネ/ n e",
"/ n o",
"ハ/ h a",
"ヒ/ h i",
"フ/ f u",
"ヘ/ h e",
"ホ/ h o",
"マ/ m a",
"ミ/ m i",
"ム/ m u",
"メ/ m e",
"モ/ m o",
"ラ/ r a",
"リ/ r i",
"ル/ r u",
"レ/ r e",
"ロ/ r o",
"ガ/ g a",
"ギ/ g i",
"グ/ g u",
"ゲ/ g e",
"ゴ/ g o",
"ザ/ z a",
"ジ/ j i",
"ズ/ z u",
"ゼ/ z e",
"ゾ/ z o",
"ダ/ d a",
"ヂ/ j i",
"ヅ/ z u",
"デ/ d e",
"ド/ d o",
"バ/ b a",
"ビ/ b i",
"ブ/ b u",
"ベ/ b e",
"ボ/ b o",
"パ/ p a",
"ピ/ p i",
"プ/ p u",
"ペ/ p e",
"ポ/ p o",
"ヤ/ y a",
"ユ/ y u",
"ヨ/ y o",
"ワ/ w a",
"ヰ/ i",
"ヱ/ e",
"ヲ/ o",
"ン/ N",
"ッ/ q",
"ヴ/ b u",
"ー/:",
# Try converting broken text
"ァ/ a",
"ィ/ i",
"ゥ/ u",
"ェ/ e",
"ォ/ o",
"ヮ/ w a",
"ォ/ o",
# Symbols
"、/ ,",
"。/ .",
"/ !",
"/ ?",
"・/ ,",
]
_COLON_RX = re.compile(":+")
_REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
def _makerulemap():
l = [tuple(x.split("/")) for x in _CONVRULES]
return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
_RULEMAP1, _RULEMAP2 = _makerulemap()
def kata2phoneme(text: str) -> str:
"""Convert katakana text to phonemes."""
text = text.strip()
res = ""
while text:
if len(text) >= 2:
x = _RULEMAP2.get(text[:2])
if x is not None:
text = text[2:]
res += x
continue
x = _RULEMAP1.get(text[0])
if x is not None:
text = text[1:]
res += x
continue
res += " " + text[0]
text = text[1:]
res = _COLON_RX.sub(":", res)
return res[1:]
_KATAKANA = "".join(chr(ch) for ch in range(ord(""), ord("") + 1))
_HIRAGANA = "".join(chr(ch) for ch in range(ord(""), ord("") + 1))
_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
def hira2kata(text: str) -> str:
text = text.translate(_HIRA2KATATRANS)
return text.replace("う゛", "ヴ")
_SYMBOL_TOKENS = set(list("・、。?!"))
_NO_YOMI_TOKENS = set(list("「」『』―()[][] …"))
_TAGGER = MeCab.Tagger()
def text2kata(text: str) -> str:
parsed = _TAGGER.parse(text)
res = []
for line in parsed.split("\n"):
if line == "EOS":
break
parts = line.split("\t")
word, yomi = parts[0], parts[1]
if yomi:
res.append(yomi)
else:
if word in _SYMBOL_TOKENS:
res.append(word)
elif word in ("", ""):
res.append("")
elif word in _NO_YOMI_TOKENS:
pass
else:
res.append(word)
return hira2kata("".join(res))
_ALPHASYMBOL_YOMI = {
"#": "シャープ",
"%": "パーセント",
"&": "アンド",
"+": "プラス",
"-": "マイナス",
":": "コロン",
";": "セミコロン",
"<": "小なり",
"=": "イコール",
">": "大なり",
"@": "アット",
"a": "エー",
"b": "ビー",
"c": "シー",
"d": "ディー",
"e": "イー",
"f": "エフ",
"g": "ジー",
"h": "エイチ",
"i": "アイ",
"j": "ジェー",
"k": "ケー",
"l": "エル",
"m": "エム",
"n": "エヌ",
"o": "オー",
"p": "ピー",
"q": "キュー",
"r": "アール",
"s": "エス",
"t": "ティー",
"u": "ユー",
"v": "ブイ",
"w": "ダブリュー",
"x": "エックス",
"y": "ワイ",
"z": "ゼット",
"α": "アルファ",
"β": "ベータ",
"γ": "ガンマ",
"δ": "デルタ",
"ε": "イプシロン",
"ζ": "ゼータ",
"η": "イータ",
"θ": "シータ",
"ι": "イオタ",
"κ": "カッパ",
"λ": "ラムダ",
"μ": "ミュー",
"ν": "ニュー",
"ξ": "クサイ",
"ο": "オミクロン",
"π": "パイ",
"ρ": "ロー",
"σ": "シグマ",
"τ": "タウ",
"υ": "ウプシロン",
"φ": "ファイ",
"χ": "カイ",
"ψ": "プサイ",
"ω": "オメガ",
}
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
def japanese_convert_numbers_to_words(text: str) -> str:
res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
return res
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
def japanese_text_to_phonemes(text: str) -> str:
"""Convert Japanese text to phonemes."""
res = unicodedata.normalize("NFKC", text)
res = japanese_convert_numbers_to_words(res)
res = japanese_convert_alpha_symbols_to_words(res)
res = text2kata(res)
res = kata2phoneme(res)
return res.replace(" ", "")
@@ -0,0 +1,44 @@
# coding: utf-8
# Add the word you want to the dictionary.
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
english_dictionary = {
"KOREA": "코리아",
"IDOL": "아이돌",
"IT": "아이티",
"IQ": "아이큐",
"UP": "",
"DOWN": "다운",
"PC": "피씨",
"CCTV": "씨씨티비",
"SNS": "에스엔에스",
"AI": "에이아이",
"CEO": "씨이오",
"A": "에이",
"B": "",
"C": "",
"D": "",
"E": "",
"F": "에프",
"G": "",
"H": "에이치",
"I": "아이",
"J": "제이",
"K": "케이",
"L": "",
"M": "",
"N": "",
"O": "",
"P": "",
"Q": "",
"R": "",
"S": "에스",
"T": "",
"U": "",
"V": "브이",
"W": "더블유",
"X": "엑스",
"Y": "와이",
"Z": "제트",
}
@@ -0,0 +1,32 @@
# coding: utf-8
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
import re
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
def normalize(text):
text = text.strip()
text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = text.lower()
return text
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
@@ -0,0 +1,36 @@
from jamo import hangul_to_jamo
from TTS.tts.utils.text.korean.korean import normalize
g2p = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
"""
The input and output values look the same, but they are different in Unicode.
example :
input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
"""
global g2p # pylint: disable=global-statement
if g2p is None:
from g2pkk import G2p
g2p = G2p()
if character == "english":
from anyascii import anyascii
text = normalize(text)
text = g2p(text)
text = anyascii(text)
return text
text = normalize(text)
text = g2p(text)
text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
return "".join(text)
@@ -0,0 +1,79 @@
from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.belarusian_phonemizer import BEL_Phonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
try:
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
except ImportError:
JA_JP_Phonemizer = None
pass
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, KO_KR_Phonemizer, BN_Phonemizer)}
ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
_ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["be"] = BEL_Phonemizer.name()
# JA phonemizer has deal breaking dependencies like MeCab for some systems.
# So we only register it when its dependencies can be imported.
if JA_JP_Phonemizer is not None:
PHONEMIZERS[JA_JP_Phonemizer.name()] = JA_JP_Phonemizer
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
"""Initiate a phonemizer by name
Args:
name (str):
Name of the phonemizer that should match `phonemizer.name()`.
kwargs (dict):
Extra keyword arguments that should be passed to the phonemizer.
"""
if name == "espeak":
return ESpeak(**kwargs)
if name == "gruut":
return Gruut(**kwargs)
if name == "zh_cn_phonemizer":
return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer":
if JA_JP_Phonemizer is None:
raise ValueError(" ❗ You need to install JA phonemizer dependencies. Try `pip install TTS[ja]`.")
return JA_JP_Phonemizer(**kwargs)
if name == "ko_kr_phonemizer":
return KO_KR_Phonemizer(**kwargs)
if name == "bn_phonemizer":
return BN_Phonemizer(**kwargs)
if name == "be_phonemizer":
return BEL_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found")
if __name__ == "__main__":
print(DEF_LANG_TO_PHONEMIZER)
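# A hedged usage sketch (not part of the original module): look up the default backend for a
# language code and instantiate it by name.
# if __name__ == "__main__":
#     name = DEF_LANG_TO_PHONEMIZER["en-us"]  # typically "espeak", since espeak entries override gruut ones
#     phonemizer = get_phonemizer_by_name(name, language="en-us")
#     print(phonemizer.name(), phonemizer.language)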
@@ -0,0 +1,62 @@
from typing import Dict
from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_BN_PUNCS = "、.,[]()?!〽~『』「」【】"
class BN_Phonemizer(BasePhonemizer):
"""🐸TTS bn phonemizer using functions in `TTS.tts.utils.text.bangla.phonemizer`
Args:
punctuations (str):
Set of characters to be treated as punctuation. Defaults to `_DEF_BN_PUNCS`.
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to False.
TODO: someone with Bangla knowledge should check this implementation
"""
language = "bn"
def __init__(self, punctuations=_DEF_BN_PUNCS, keep_puncs=False, **kwargs):  # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "bn_phonemizer"
@staticmethod
def phonemize_bn(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
ph = bangla_text_to_phonemes(text)
return ph
def _phonemize(self, text, separator):
return self.phonemize_bn(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"bn": "Bangla"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
txt = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন বলে."
e = BN_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print("`" + e.phonemize(txt) + "`")
@@ -0,0 +1,140 @@
import abc
from typing import List, Tuple
from TTS.tts.utils.text.punctuation import Punctuation
class BasePhonemizer(abc.ABC):
"""Base phonemizer class
Phonemization follows the following steps:
1. Preprocessing:
- remove empty lines
- remove punctuation
- keep track of punctuation marks
2. Phonemization:
- convert text to phonemes
3. Postprocessing:
- join phonemes
- restore punctuation marks
Args:
language (str):
Language used by the phonemizer.
punctuations (List[str]):
List of punctuation marks to be preserved.
keep_puncs (bool):
Whether to preserve punctuation marks or not.
"""
def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
# ensure the backend is installed on the system
if not self.is_available():
raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover
# ensure the backend supports the requested language
self._language = self._init_language(language)
# setup punctuation processing
self._keep_puncs = keep_puncs
self._punctuator = Punctuation(punctuations)
def _init_language(self, language):
"""Language initialization
This method may be overloaded in child classes (see Segments backend)
"""
if not self.is_supported_language(language):
raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")
return language
@property
def language(self):
"""The language code configured to be used for phonemization"""
return self._language
@staticmethod
@abc.abstractmethod
def name():
"""The name of the backend"""
...
@classmethod
@abc.abstractmethod
def is_available(cls):
"""Returns True if the backend is installed, False otherwise"""
...
@classmethod
@abc.abstractmethod
def version(cls):
"""Return the backend version as a tuple (major, minor, patch)"""
...
@staticmethod
@abc.abstractmethod
def supported_languages():
"""Return a dict of language codes -> name supported by the backend"""
...
def is_supported_language(self, language):
"""Returns True if `language` is supported by the backend"""
return language in self.supported_languages()
@abc.abstractmethod
def _phonemize(self, text, separator):
"""The main phonemization method"""
def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
"""Preprocess the text before phonemization
1. strip leading and trailing whitespace
2. strip punctuation (kept aside for restoration when `keep_puncs` is True)
Override this if you need a different behaviour
"""
text = text.strip()
if self._keep_puncs:
# a tuple (text, punctuation marks)
return self._punctuator.strip_to_restore(text)
return [self._punctuator.strip(text)], []
def _phonemize_postprocess(self, phonemized, punctuations) -> str:
"""Postprocess the raw phonemized output
Override this if you need a different behaviour
"""
if self._keep_puncs:
return self._punctuator.restore(phonemized, punctuations)[0]
return phonemized[0]
def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument
"""Returns the `text` phonemized for the given language
Args:
text (str):
Text to be phonemized.
separator (str):
String separator used between phonemes. Defaults to '|'.
Returns:
(str): Phonemized text
"""
text, punctuations = self._phonemize_preprocess(text)
phonemized = []
for t in text:
p = self._phonemize(t, separator)
phonemized.append(p)
phonemized = self._phonemize_postprocess(phonemized, punctuations)
return phonemized
def print_logs(self, level: int = 0):
indent = "\t" * level
print(f"{indent}| > phoneme language: {self.language}")
print(f"{indent}| > phoneme backend: {self.name()}")
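# A minimal subclass sketch (illustrative only, not part of the original file) showing which
# members a concrete backend must provide; the "phonemization" below is a dummy upper-casing.
# class UpperCasePhonemizer(BasePhonemizer):
#     @staticmethod
#     def name():
#         return "uppercase"
#     @classmethod
#     def is_available(cls):
#         return True
#     @classmethod
#     def version(cls):
#         return "0.0.1"
#     @staticmethod
#     def supported_languages():
#         return {"en-us": "English (dummy)"}
#     def _phonemize(self, text, separator):
#         return separator.join(text.upper())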
@@ -0,0 +1,55 @@
from typing import Dict
from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_BE_PUNCS = ",!." # TODO
class BEL_Phonemizer(BasePhonemizer):
"""🐸TTS be phonemizer using functions in `TTS.tts.utils.text.belarusian.phonemizer`
Args:
punctuations (str):
Set of characters to be treated as punctuation. Defaults to `_DEF_BE_PUNCS`.
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to False.
"""
language = "be"
def __init__(self, punctuations=_DEF_BE_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "be_phonemizer"
@staticmethod
def phonemize_be(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
return belarusian_text_to_phonemes(text)
def _phonemize(self, text, separator):
return self.phonemize_be(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"be": "Belarusian"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
txt = "тэст"
e = BEL_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print("`" + e.phonemize(txt) + "`")
@@ -0,0 +1,264 @@
import logging
import re
import subprocess
from typing import Dict, List
from packaging.version import Version
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.punctuation import Punctuation
def is_tool(name):
from shutil import which
return which(name) is not None
# Use a regex pattern to match the espeak version, because it may be
# symlinked to espeak-ng, which moves the version bits to another spot.
espeak_version_pattern = re.compile(r"text-to-speech:\s(?P<version>\d+\.\d+(\.\d+)?)")
def get_espeak_version():
output = subprocess.getoutput("espeak --version")
match = espeak_version_pattern.search(output)
return match.group("version")
def get_espeakng_version():
output = subprocess.getoutput("espeak-ng --version")
return output.split()[3]
# priority: espeakng > espeak
if is_tool("espeak-ng"):
_DEF_ESPEAK_LIB = "espeak-ng"
_DEF_ESPEAK_VER = get_espeakng_version()
elif is_tool("espeak"):
_DEF_ESPEAK_LIB = "espeak"
_DEF_ESPEAK_VER = get_espeak_version()
else:
_DEF_ESPEAK_LIB = None
_DEF_ESPEAK_VER = None
def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]:
"""Run espeak with the given arguments."""
cmd = [
espeak_lib,
"-q",
"-b",
"1", # UTF8 text encoding
]
cmd.extend(args)
logging.debug("espeakng: executing %s", repr(cmd))
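# Note on `sync` (comment added for clarity): when False, the raw stdout line iterator is
# returned and the pipes are closed right away; when True, all output lines are collected,
# the process is waited on, and the list of byte strings is returned.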
with subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
) as p:
res = iter(p.stdout.readline, b"")
if not sync:
p.stdout.close()
if p.stderr:
p.stderr.close()
if p.stdin:
p.stdin.close()
return res
res2 = []
for line in res:
res2.append(line)
p.stdout.close()
if p.stderr:
p.stderr.close()
if p.stdin:
p.stdin.close()
p.wait()
return res2
class ESpeak(BasePhonemizer):
"""ESpeak wrapper calling `espeak` or `espeak-ng` from the command line to perform G2P
Args:
language (str):
Valid language code for the used backend.
backend (str):
Name of the backend library to use. `espeak` or `espeak-ng`. If None, set automatically
preferring `espeak-ng` over `espeak`. Defaults to None.
punctuations (str):
Characters to be treated as punctuation. Defaults to Punctuation.default_puncs().
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to True.
Example:
>>> from TTS.tts.utils.text.phonemizers import ESpeak
>>> phonemizer = ESpeak("tr")
>>> phonemizer.phonemize("Bu Türkçe, bir örnektir.", separator="|")
'b|ʊ t|ˈø|r|k|tʃ|ɛ, b|ɪ|r œ|r|n|ˈɛ|c|t|ɪ|r.'
"""
_ESPEAK_LIB = _DEF_ESPEAK_LIB
_ESPEAK_VER = _DEF_ESPEAK_VER
def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True):
if self._ESPEAK_LIB is None:
raise Exception(" [!] No espeak backend found. Install espeak-ng or espeak to your system.")
self.backend = self._ESPEAK_LIB
# band-aid for backwards compatibility
if language == "en":
language = "en-us"
if language == "zh-cn":
language = "cmn"
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
if backend is not None:
self.backend = backend
@property
def backend(self):
return self._ESPEAK_LIB
@property
def backend_version(self):
return self._ESPEAK_VER
@backend.setter
def backend(self, backend):
if backend not in ["espeak", "espeak-ng"]:
raise Exception("Unknown backend: %s" % backend)
self._ESPEAK_LIB = backend
self._ESPEAK_VER = get_espeakng_version() if backend == "espeak-ng" else get_espeak_version()
def auto_set_espeak_lib(self) -> None:
if is_tool("espeak-ng"):
self._ESPEAK_LIB = "espeak-ng"
self._ESPEAK_VER = get_espeakng_version()
elif is_tool("espeak"):
self._ESPEAK_LIB = "espeak"
self._ESPEAK_VER = get_espeak_version()
else:
raise Exception("Cannot set backend automatically. espeak-ng or espeak not found")
@staticmethod
def name():
return "espeak"
def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str:
"""Convert input text to phonemes.
Args:
text (str):
Text to be converted to phonemes.
tie (bool, optional) : When True use a '͡' character between
consecutive characters of a single phoneme. Else separate phoneme
with '_'. This option requires espeak>=1.49. Default to False.
"""
# set arguments
args = ["-v", f"{self._language}"]
# espeak and espeak-ng parse `ipa` differently
if tie:
# use '͡' between phonemes
if self.backend == "espeak":
args.append("--ipa=1")
else:
args.append("--ipa=3")
else:
# split with '_'
if self.backend == "espeak":
if Version(self.backend_version) >= Version("1.48.15"):
args.append("--ipa=1")
else:
args.append("--ipa=3")
else:
args.append("--ipa=1")
if tie:
args.append("--tie=%s" % tie)
args.append(text)
# compute phonemes
phonemes = ""
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
logging.debug("line: %s", repr(line))
ph_decoded = line.decode("utf8").strip()
# espeak:
# version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak-ng:
# "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak-ng backend can add language flags that need to be removed:
# "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
# phonemize needs to remove the language flags of the returned text:
# "sɛʁtˈɛ̃ mˈo kɔm fˈʊtbɔːl ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
ph_decoded = re.sub(r"\(.+?\)", "", ph_decoded)
phonemes += ph_decoded.strip()
return phonemes.replace("_", separator)
def _phonemize(self, text, separator=None):
return self.phonemize_espeak(text, separator, tie=False)
@staticmethod
def supported_languages() -> Dict:
"""Get a dictionary of supported languages.
Returns:
Dict: Dictionary of language codes.
"""
if _DEF_ESPEAK_LIB is None:
return {}
args = ["--voices"]
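# `espeak --voices` prints a header row followed by one voice per line; the loop below skips
# the header (count == 0) and takes column 2 as the language code and column 4 as its name.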
langs = {}
count = 0
for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True):
line = line.decode("utf8").strip()
if count > 0:
cols = line.split()
lang_code = cols[1]
lang_name = cols[3]
langs[lang_code] = lang_name
logging.debug("line: %s", repr(line))
count += 1
return langs
def version(self) -> str:
"""Get the version of the used backend.
Returns:
str: Version of the used backend.
"""
args = ["--version"]
for line in _espeak_exe(self.backend, args, sync=True):
version = line.decode("utf8").strip().split()[2]
logging.debug("line: %s", repr(line))
return version
@classmethod
def is_available(cls):
"""Return true if ESpeak is available else false"""
return is_tool("espeak") or is_tool("espeak-ng")
if __name__ == "__main__":
e = ESpeak(language="en-us")
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
e = ESpeak(language="en-us", keep_puncs=False)
print("`" + e.phonemize("hello how are you today?") + "`")
e = ESpeak(language="en-us", keep_puncs=True)
print("`" + e.phonemize("hello how are you today?") + "`")
@@ -0,0 +1,151 @@
import importlib
from typing import List
import gruut
from gruut_ipa import IPA
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.punctuation import Punctuation
# Table for str.translate to fix gruut/TTS phoneme mismatch
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
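# e.g. gruut's ASCII "g" (U+0067) is rewritten to IPA "ɡ" (U+0261) to match the 🐸TTS phoneme set.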
class Gruut(BasePhonemizer):
"""Gruut wrapper for G2P
Args:
language (str):
Valid language code for the used backend.
punctuations (str):
Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.
keep_puncs (bool):
If true, keep the punctuations after phonemization. Defaults to True.
use_espeak_phonemes (bool):
If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.
keep_stress (bool):
If true, keep the stress characters after phonemization. Defaults to False.
Example:
>>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
>>> phonemizer = Gruut('en-us')
>>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
"""
def __init__(
self,
language: str,
punctuations=Punctuation.default_puncs(),
keep_puncs=True,
use_espeak_phonemes=False,
keep_stress=False,
):
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
self.use_espeak_phonemes = use_espeak_phonemes
self.keep_stress = keep_stress
@staticmethod
def name():
return "gruut"
def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument
"""Convert input text to phonemes.
Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters
that constitute a single sound.
It doesn't affect 🐸TTS since it individually converts each character to token IDs.
Examples::
"hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`
Args:
text (str):
Text to be converted to phonemes.
tie (bool, optional) : When True use a '͡' character between
consecutive characters of a single phoneme. Else separate phoneme
with '_'. This option requires espeak>=1.49. Default to False.
"""
ph_list = []
for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
for word in sentence:
if word.is_break:
# Use actual character for break phoneme (e.g., comma)
if ph_list:
# Join with previous word
ph_list[-1].append(word.text)
else:
# First word is punctuation
ph_list.append([word.text])
elif word.phonemes:
# Add phonemes for word
word_phonemes = []
for word_phoneme in word.phonemes:
if not self.keep_stress:
# Remove primary/secondary stress
word_phoneme = IPA.without_stress(word_phoneme)
word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)
if word_phoneme:
# Flatten phonemes
word_phonemes.extend(word_phoneme)
if word_phonemes:
ph_list.append(word_phonemes)
ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
ph = f"{separator} ".join(ph_words)
return ph
def _phonemize(self, text, separator):
return self.phonemize_gruut(text, separator, tie=False)
def is_supported_language(self, language):
"""Returns True if `language` is supported by the backend"""
return gruut.is_language_supported(language)
@staticmethod
def supported_languages() -> List:
"""Get a dictionary of supported languages.
Returns:
List: List of language codes.
"""
return list(gruut.get_supported_languages())
def version(self):
"""Get the version of the used backend.
Returns:
str: Version of the used backend.
"""
return gruut.__version__
@classmethod
def is_available(cls):
"""Return True if gruut is available else False"""
return importlib.util.find_spec("gruut") is not None
if __name__ == "__main__":
e = Gruut(language="en-us")
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
e = Gruut(language="en-us", keep_puncs=False)
print("`" + e.phonemize("hello how are you today?") + "`")
e = Gruut(language="en-us", keep_puncs=True)
print("`" + e.phonemize("hello how, are you today?") + "`")
@@ -0,0 +1,72 @@
from typing import Dict
from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】"
_TRANS_TABLE = {"、": ","}
def trans(text):
for i, j in _TRANS_TABLE.items():
text = text.replace(i, j)
return text
class JA_JP_Phonemizer(BasePhonemizer):
"""🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer`
TODO: someone with JA knowledge should check this implementation
Example:
>>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer
>>> phonemizer = JA_JP_Phonemizer()
>>> phonemizer.phonemize("どちらに行きますか?", separator="|")
'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?'
"""
language = "ja-jp"
def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ja_jp_phonemizer"
def _phonemize(self, text: str, separator: str = "|") -> str:
ph = japanese_text_to_phonemes(text)
if separator is not None and separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator="|", language=None) -> str:
"""Custom phonemize for JP_JA
Skip pre-post processing steps used by the other phonemizers.
"""
return self._phonemize(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"ja-jp": "Japanese (Japan)"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
# if __name__ == "__main__":
# text = "これは、電話をかけるための私の日本語の例のテキストです。"
# e = JA_JP_Phonemizer()
# print(e.supported_languages())
# print(e.version())
# print(e.language)
# print(e.name())
# print(e.is_available())
# print("`" + e.phonemize(text) + "`")
@@ -0,0 +1,65 @@
from typing import Dict
from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
class KO_KR_Phonemizer(BasePhonemizer):
"""🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
Example:
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
"""
language = "ko-kr"
def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ko_kr_phonemizer"
def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
ph = korean_text_to_phonemes(text, character=character)
if separator is not None and separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str:
return self._phonemize(text, separator, character)
@staticmethod
def supported_languages() -> Dict:
return {"ko-kr": "hangeul(korean)"}
def version(self) -> str:
return "0.0.2"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
e = KO_KR_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print(e.phonemize(texts))
@@ -0,0 +1,65 @@
from typing import Dict, List
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
class MultiPhonemizer:
"""🐸TTS multi-phonemizer that operates phonemizers for multiple languages
Args:
custom_lang_to_phonemizer (Dict):
Custom phonemizer mapping if you want to change the defaults. In the format of
`{"lang_code": "phonemizer_name"}`. When a value is an empty string, `DEF_LANG_TO_PHONEMIZER` is used. Defaults to `{}`.
TODO: find a way to pass custom kwargs to the phonemizers
"""
lang_to_phonemizer = {}
def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value
for k, v in lang_to_phonemizer_name.items():
if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys():
lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k]
elif v == "":
raise ValueError(f"Phonemizer wasn't set for language {k} and doesn't have a default.")
self.lang_to_phonemizer_name = lang_to_phonemizer_name
self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
@staticmethod
def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict:
lang_to_phonemizer = {}
for k, v in lang_to_phonemizer_name.items():
lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k)
return lang_to_phonemizer
@staticmethod
def name():
return "multi-phonemizer"
def phonemize(self, text, separator="|", language=""):
if language == "":
raise ValueError("Language must be set for multi-phonemizer to phonemize.")
return self.lang_to_phonemizer[language].phonemize(text, separator)
def supported_languages(self) -> List:
return list(self.lang_to_phonemizer.keys())
def print_logs(self, level: int = 0):
indent = "\t" * level
print(f"{indent}| > phoneme language: {self.supported_languages()}")
print(f"{indent}| > phoneme backend: {self.name()}")
# if __name__ == "__main__":
# texts = {
# "tr": "Merhaba, bu Türkçe bit örnek!",
# "en-us": "Hello, this is English example!",
# "de": "Hallo, das ist ein Deutches Beipiel!",
# "zh-cn": "这是中国的例子",
# }
# phonemes = {}
# ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})
# for lang, text in texts.items():
# phoneme = ph.phonemize(text, lang)
# phonemes[lang] = phoneme
# print(phonemes)
@@ -0,0 +1,62 @@
from typing import Dict
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
class ZH_CN_Phonemizer(BasePhonemizer):
"""🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer`
Args:
punctuations (str):
Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to False.
Example ::
"这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| || |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
TODO: someone with Mandarin knowledge should check this implementation
"""
language = "zh-cn"
def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "zh_cn_phonemizer"
@staticmethod
def phonemize_zh_cn(text: str, separator: str = "|") -> str:
ph = chinese_text_to_phonemes(text, separator)
return ph
def _phonemize(self, text, separator):
return self.phonemize_zh_cn(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"zh-cn": "Chinese (China)"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
# if __name__ == "__main__":
# text = "这是,样本中文。"
# e = ZH_CN_Phonemizer()
# print(e.supported_languages())
# print(e.version())
# print(e.language)
# print(e.name())
# print(e.is_available())
# print("`" + e.phonemize(text) + "`")
@@ -0,0 +1,171 @@
import collections
import re
from enum import Enum
import six
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'
_PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])
class PuncPosition(Enum):
"""Enum for the punctuations positions"""
BEGIN = 0
END = 1
MIDDLE = 2
class Punctuation:
"""Handle punctuations in text.
Just strip punctuations from text or strip and restore them later.
Args:
puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.
Example:
>>> punc = Punctuation()
>>> punc.strip("This is. example !")
'This is example'
>>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
>>> ' '.join(text_striped)
'This is example'
>>> text_restored = punc.restore(text_striped, punc_map)
>>> text_restored[0]
'This is. example !'
"""
def __init__(self, puncs: str = _DEF_PUNCS):
self.puncs = puncs
@staticmethod
def default_puncs():
"""Return default set of punctuations."""
return _DEF_PUNCS
@property
def puncs(self):
return self._puncs
@puncs.setter
def puncs(self, value):
if not isinstance(value, six.string_types):
raise ValueError("[!] Punctuations must be of type str.")
self._puncs = "".join(list(dict.fromkeys(list(value))))  # remove duplicates without changing the order
self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")
def strip(self, text):
"""Remove all the punctuations by replacing with `space`.
Args:
text (str): The text to be processed.
Example::
"This is. example !" -> "This is example "
"""
return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip()
def strip_to_restore(self, text):
"""Remove punctuations from text to restore them later.
Args:
text (str): The text to be processed.
Examples ::
"This is. example !" -> [["This is", "example"], [".", "!"]]
"""
text, puncs = self._strip_to_restore(text)
return text, puncs
def _strip_to_restore(self, text):
"""Auxiliary method for Punctuation.strip_to_restore()"""
matches = list(re.finditer(self.puncs_regular_exp, text))
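# This method returns a pair: the chunks of text between punctuation marks, and a list of
# _PUNC_IDX records (mark + BEGIN/MIDDLE/END position) that restore() uses to put the marks back.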
if not matches:
return [text], []
# the text is only punctuations
if len(matches) == 1 and matches[0].group() == text:
return [], [_PUNC_IDX(text, PuncPosition.BEGIN)]
# build a punctuation map to be used later to restore punctuations
puncs = []
for match in matches:
position = PuncPosition.MIDDLE
if match == matches[0] and text.startswith(match.group()):
position = PuncPosition.BEGIN
elif match == matches[-1] and text.endswith(match.group()):
position = PuncPosition.END
puncs.append(_PUNC_IDX(match.group(), position))
# convert str text to a List[str], each item is separated by a punctuation
splitted_text = []
for idx, punc in enumerate(puncs):
split = text.split(punc.punc)
prefix, suffix = split[0], punc.punc.join(split[1:])
text = suffix
if prefix == "":
# We don't want to insert an empty string in case of initial punctuation
continue
splitted_text.append(prefix)
# if the text does not end with a punctuation, add it to the last item
if idx == len(puncs) - 1 and len(suffix) > 0:
splitted_text.append(suffix)
return splitted_text, puncs
@classmethod
def restore(cls, text, puncs):
"""Restore punctuation in a text.
Args:
text (str): The text to be processed.
puncs (List[str]): The list of punctuations map to be used for restoring.
Examples ::
['This is', 'example'], ['.', '!'] -> "This is. example!"
"""
return cls._restore(text, puncs)
@classmethod
def _restore(cls, text, puncs): # pylint: disable=too-many-return-statements
"""Auxiliary method for Punctuation.restore()"""
if not puncs:
return text
# nothing has been phonemized, return the puncs alone
if not text:
return ["".join(m.punc for m in puncs)]
current = puncs[0]
if current.position == PuncPosition.BEGIN:
return cls._restore([current.punc + text[0]] + text[1:], puncs[1:])
if current.position == PuncPosition.END:
return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:])
# POSITION == MIDDLE
if len(text) == 1: # pragma: nocover
# a corner case where the final part of an intermediate
# mark (I) has not been phonemized
return cls._restore([text[0] + current.punc], puncs[1:])
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:])
# if __name__ == "__main__":
# punc = Punctuation()
# text = "This is. This is, example!"
# print(punc.strip(text))
# split_text, puncs = punc.strip_to_restore(text)
# print(split_text, " ---- ", puncs)
# restored_text = punc.restore(split_text, puncs)
# print(restored_text)
@@ -0,0 +1,216 @@
from typing import Callable, Dict, List, Union
from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
from TTS.utils.generic_utils import get_import_path, import_class
class TTSTokenizer:
"""🐸TTS tokenizer to convert input characters to token IDs and back.
Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.
Args:
use_phonemes (bool):
Whether to use phonemes instead of characters. Defaults to False.
characters (Characters):
A Characters object to use for character-to-ID and ID-to-character mappings.
text_cleaner (callable):
A function to pre-process the text before tokenization and phonemization. Defaults to None.
phonemizer (Phonemizer):
A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.
Example:
>>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
>>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
>>> text = "Hello world!"
>>> ids = tokenizer.text_to_ids(text)
>>> text_hat = tokenizer.ids_to_text(ids)
>>> assert text == text_hat
"""
def __init__(
self,
use_phonemes=False,
text_cleaner: Callable = None,
characters: "BaseCharacters" = None,
phonemizer: Union["Phonemizer", Dict] = None,
add_blank: bool = False,
use_eos_bos=False,
):
self.text_cleaner = text_cleaner
self.use_phonemes = use_phonemes
self.add_blank = add_blank
self.use_eos_bos = use_eos_bos
self.characters = characters
self.not_found_characters = []
self.phonemizer = phonemizer
@property
def characters(self):
return self._characters
@characters.setter
def characters(self, new_characters):
self._characters = new_characters
self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
def encode(self, text: str) -> List[int]:
"""Encodes a string of text as a sequence of IDs."""
token_ids = []
for char in text:
try:
idx = self.characters.char_to_id(char)
token_ids.append(idx)
except KeyError:
# discard but store not found characters
if char not in self.not_found_characters:
self.not_found_characters.append(char)
print(text)
print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
return token_ids
def decode(self, token_ids: List[int]) -> str:
"""Decodes a sequence of IDs to a string of text."""
text = ""
for token_id in token_ids:
text += self.characters.id_to_char(token_id)
return text
def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument
"""Converts a string of text to a sequence of token IDs.
Args:
text(str):
The text to convert to token IDs.
language(str):
The language code of the text. Defaults to None.
TODO:
- Add support for language-specific processing.
1. Text normalization
2. Phonemization (if use_phonemes is True)
3. Add blank char between characters
4. Add BOS and EOS characters
5. Text to token IDs
"""
# TODO: text cleaner should pick the right routine based on the language
if self.text_cleaner is not None:
text = self.text_cleaner(text)
if self.use_phonemes:
text = self.phonemizer.phonemize(text, separator="", language=language)
text = self.encode(text)
if self.add_blank:
text = self.intersperse_blank_char(text, True)
if self.use_eos_bos:
text = self.pad_with_bos_eos(text)
return text
def ids_to_text(self, id_sequence: List[int]) -> str:
"""Converts a sequence of token IDs to a string of text."""
return self.decode(id_sequence)
def pad_with_bos_eos(self, char_sequence: List[str]):
"""Pads a sequence with the special BOS and EOS characters."""
return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
"""Intersperses the blank character between characters in a sequence.
Use the ```blank``` character if defined else use the ```pad``` character.
"""
char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
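# e.g. [5, 3, 7] with blank id 0 -> [0, 5, 0, 3, 0, 7, 0]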
result = [char_to_use] * (len(char_sequence) * 2 + 1)
result[1::2] = char_sequence
return result
def print_logs(self, level: int = 0):
indent = "\t" * level
print(f"{indent}| > add_blank: {self.add_blank}")
print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
print(f"{indent}| > use_phonemes: {self.use_phonemes}")
if self.use_phonemes:
print(f"{indent}| > phonemizer:")
self.phonemizer.print_logs(level + 1)
if len(self.not_found_characters) > 0:
print(f"{indent}| > {len(self.not_found_characters)} not found characters:")
for char in self.not_found_characters:
print(f"{indent}| > {char}")
@staticmethod
def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
"""Init Tokenizer object from config
Args:
config (Coqpit): Coqpit model config.
characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
the config values. Defaults to None.
"""
# init cleaners
text_cleaner = None
if isinstance(config.text_cleaner, (str, list)):
text_cleaner = getattr(cleaners, config.text_cleaner)
# init characters
if characters is None:
# set characters based on defined characters class
if config.characters and config.characters.characters_class:
CharactersClass = import_class(config.characters.characters_class)
characters, new_config = CharactersClass.init_from_config(config)
# set characters based on config
else:
if config.use_phonemes:
# init phoneme set
characters, new_config = IPAPhonemes().init_from_config(config)
else:
# init character set
characters, new_config = Graphemes().init_from_config(config)
else:
characters, new_config = characters.init_from_config(config)
# set characters class
new_config.characters.characters_class = get_import_path(characters)
# init phonemizer
phonemizer = None
if config.use_phonemes:
if "phonemizer" in config and config.phonemizer == "multi_phonemizer":
lang_to_phonemizer_name = {}
for dataset in config.datasets:
if dataset.language != "":
lang_to_phonemizer_name[dataset.language] = dataset.phonemizer
else:
raise ValueError("Multi phonemizer requires language to be set for each dataset.")
phonemizer = MultiPhonemizer(lang_to_phonemizer_name)
else:
phonemizer_kwargs = {"language": config.phoneme_language}
if "phonemizer" in config and config.phonemizer:
phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
else:
try:
phonemizer = get_phonemizer_by_name(
DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
)
new_config.phonemizer = phonemizer.name()
except KeyError as e:
raise ValueError(
f"""No phonemizer found for language {config.phoneme_language}.
You may need to install a third party library for this language."""
) from e
return (
TTSTokenizer(
config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
),
new_config,
)
@@ -0,0 +1,238 @@
import librosa
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.colors import LogNorm
matplotlib.use("Agg")
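# Agg is a non-interactive backend, so these plotting helpers also work on headless machines.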
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False, plot_log=False):
if isinstance(alignment, torch.Tensor):
alignment_ = alignment.detach().cpu().numpy().squeeze()
else:
alignment_ = alignment
alignment_ = alignment_.astype(np.float32) if alignment_.dtype == np.float16 else alignment_
fig, ax = plt.subplots(figsize=fig_size)
im = ax.imshow(
alignment_.T, aspect="auto", origin="lower", interpolation="none", norm=LogNorm() if plot_log else None
)
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
# plt.yticks(range(len(text)), list(text))
plt.tight_layout()
if title is not None:
plt.title(title)
if not output_fig:
plt.close()
return fig
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
if isinstance(spectrogram, torch.Tensor):
spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
else:
spectrogram_ = spectrogram.T
spectrogram_ = spectrogram_.astype(np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
if ap is not None:
spectrogram_ = ap.denormalize(spectrogram_) # pylint: disable=protected-access
fig = plt.figure(figsize=fig_size)
plt.imshow(spectrogram_, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
if not output_fig:
plt.close()
return fig
def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False):
"""Plot pitch curves on top of the spectrogram.
Args:
pitch (np.array): Pitch values.
spectrogram (np.array): Spectrogram values.
Shapes:
pitch: :math:`(T,)`
spec: :math:`(C, T)`
"""
if isinstance(spectrogram, torch.Tensor):
spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
else:
spectrogram_ = spectrogram.T
spectrogram_ = spectrogram_.astype(np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
if ap is not None:
spectrogram_ = ap.denormalize(spectrogram_) # pylint: disable=protected-access
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
ax.imshow(spectrogram_, aspect="auto", origin="lower")
ax.set_xlabel("time")
ax.set_ylabel("spec_freq")
ax2 = ax.twinx()
ax2.plot(pitch, linewidth=5.0, color="red")
ax2.set_ylabel("F0")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
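# A hedged usage sketch (not part of the original file), assuming a spectrogram of shape (C, T)
# and a pitch contour of shape (T,):
# if __name__ == "__main__":
#     spec = np.random.rand(80, 120)
#     f0 = np.random.rand(120)
#     plot_pitch(f0, spec, fig_size=(12, 6), output_fig=True)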
def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
"""Plot pitch curves on top of the input characters.
Args:
pitch (np.array): Pitch values.
chars (str): Characters to place to the x-axis.
Shapes:
pitch: :math:`(T,)`
"""
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
x = np.array(range(len(chars)))
my_xticks = chars
plt.xticks(x, my_xticks)
ax.set_xlabel("characters")
ax.set_ylabel("freq")
ax2 = ax.twinx()
ax2.plot(pitch, linewidth=5.0, color="red")
ax2.set_ylabel("F0")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False):
"""Plot energy curves on top of the input characters.
Args:
energy (np.array): energy values.
chars (str): Characters to place to the x-axis.
Shapes:
energy: :math:`(T,)`
"""
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
x = np.array(range(len(chars)))
my_xticks = chars
plt.xticks(x, my_xticks)
ax.set_xlabel("characters")
ax.set_ylabel("freq")
ax2 = ax.twinx()
ax2.plot(energy, linewidth=5.0, color="red")
ax2.set_ylabel("energy")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
def visualize(
alignment,
postnet_output,
text,
hop_length,
CONFIG,
tokenizer,
stop_tokens=None,
decoder_output=None,
output_path=None,
figsize=(8, 24),
output_fig=False,
):
"""Intended to be used in Notebooks."""
if decoder_output is not None:
num_plot = 4
else:
num_plot = 3
label_fontsize = 16
fig = plt.figure(figsize=figsize)
plt.subplot(num_plot, 1, 1)
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
plt.xlabel("Decoder timestep", fontsize=label_fontsize)
plt.ylabel("Encoder timestep", fontsize=label_fontsize)
# compute phoneme representation and back
if CONFIG.use_phonemes:
seq = tokenizer.text_to_ids(text)
text = tokenizer.ids_to_text(seq)
print(text)
plt.yticks(range(len(text)), list(text))
plt.colorbar()
if stop_tokens is not None:
# plot stopnet predictions
plt.subplot(num_plot, 1, 2)
plt.plot(range(len(stop_tokens)), list(stop_tokens))
# plot postnet spectrogram
plt.subplot(num_plot, 1, 3)
librosa.display.specshow(
postnet_output.T,
sr=CONFIG.audio["sample_rate"],
hop_length=hop_length,
x_axis="time",
y_axis="linear",
fmin=CONFIG.audio["mel_fmin"],
fmax=CONFIG.audio["mel_fmax"],
)
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if decoder_output is not None:
plt.subplot(num_plot, 1, 4)
librosa.display.specshow(
decoder_output.T,
sr=CONFIG.audio["sample_rate"],
hop_length=hop_length,
x_axis="time",
y_axis="linear",
fmin=CONFIG.audio["mel_fmin"],
fmax=CONFIG.audio["mel_fmax"],
)
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if output_path:
print(output_path)
fig.savefig(output_path)
plt.close()
if not output_fig:
plt.close()