Add files via upload

This commit is contained in:
Sam Khoze
2024-06-18 19:43:44 -07:00
committed by GitHub
parent 7d608044ef
commit 69cd493d03
97 changed files with 5916 additions and 0 deletions
10 binary files not shown.
@@ -0,0 +1 @@
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s 
u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
@@ -0,0 +1,79 @@
import bisect

import numpy as np
import torch


def _pad_data(x, length):
    _pad = 0
    assert x.ndim == 1
    return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)


def prepare_data(inputs):
    max_len = max((len(x) for x in inputs))
    return np.stack([_pad_data(x, max_len) for x in inputs])


def _pad_tensor(x, length):
    _pad = 0.0
    assert x.ndim == 2
    x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode="constant", constant_values=_pad)
    return x


def prepare_tensor(inputs, out_steps):
    max_len = max((x.shape[1] for x in inputs))
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_tensor(x, pad_len) for x in inputs])


def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
    """Pad stop target array.

    Args:
        x (np.ndarray): Stop target array.
        length (int): Length after padding.
        pad_val (int, optional): Padding value. Defaults to 1.

    Returns:
        np.ndarray: Padded stop target array.
    """
    assert x.ndim == 1
    return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)


def prepare_stop_target(inputs, out_steps):
    """Pad row vectors with 1."""
    max_len = max((x.shape[0] for x in inputs))
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_stop_target(x, pad_len) for x in inputs])


def pad_per_step(inputs, pad_len):
    return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)


def get_length_balancer_weights(items: list, num_buckets=10):
    # get all durations
    audio_lengths = np.array([item["audio_length"] for item in items])
    # create the $num_buckets bucket classes based on the dataset max and min length
    max_length = int(max(audio_lengths))
    min_length = int(min(audio_lengths))
    step = int((max_length - min_length) / num_buckets) + 1
    buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
    # add each sample to its respective length bucket
    buckets_names = np.array(
        [buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
    )
    # count and compute the bucket weight for each sample
    unique_buckets_names = np.unique(buckets_names).tolist()
    bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
    bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
    weight_bucket = 1.0 / bucket_count
    dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
    # normalize
    dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
    return torch.from_numpy(dataset_samples_weight).float()
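A minimal usage sketch for the padding and length-balancing helpers defined above (the synthetic inputs are assumptions for illustration, and the functions are assumed importable from this module):

import numpy as np
from torch.utils.data import WeightedRandomSampler

# variable-length id sequences -> padded to the batch maximum
print(prepare_data([np.array([1, 2, 3]), np.array([4, 5])]).shape)  # (2, 3)

# [C, T] features -> padded so T becomes a multiple of `out_steps`
specs = [np.random.rand(80, 101), np.random.rand(80, 50)]
print(prepare_tensor(specs, out_steps=2).shape)  # (2, 80, 102)

# per-sample weights that even out audio-length buckets for a sampler
items = [{"audio_length": float(l)} for l in np.random.uniform(1.0, 10.0, 32)]
weights = get_length_balancer_weights(items, num_buckets=5)
sampler = WeightedRandomSampler(weights, num_samples=len(weights))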
@@ -0,0 +1,48 @@
import torch


def rehash_fairseq_vits_checkpoint(checkpoint_file):
    chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
    new_chk = {}
    for k, v in chk.items():
        if "enc_p." in k:
            new_chk[k.replace("enc_p.", "text_encoder.")] = v
        elif "dec." in k:
            new_chk[k.replace("dec.", "waveform_decoder.")] = v
        elif "enc_q." in k:
            new_chk[k.replace("enc_q.", "posterior_encoder.")] = v
        elif "flow.flows.2." in k:
            new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v
        elif "flow.flows.4." in k:
            new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v
        elif "flow.flows.6." in k:
            new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v
        elif "dp.flows.0.m" in k:
            new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v
        elif "dp.flows.0.logs" in k:
            new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v
        elif "dp.flows.1" in k:
            new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v
        elif "dp.flows.3" in k:
            new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v
        elif "dp.flows.5" in k:
            new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v
        elif "dp.flows.7" in k:
            new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v
        elif "dp.post_flows.0.m" in k:
            new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v
        elif "dp.post_flows.0.logs" in k:
            new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v
        elif "dp.post_flows.1" in k:
            new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v
        elif "dp.post_flows.3" in k:
            new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v
        elif "dp.post_flows.5" in k:
            new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v
        elif "dp.post_flows.7" in k:
            new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v
        elif "dp." in k:
            new_chk[k.replace("dp.", "duration_predictor.")] = v
        else:
            new_chk[k] = v
    return new_chk
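A hedged usage sketch for the converter above; the checkpoint file names are placeholders, and how the remapped state dict is loaded into a model afterwards depends on the VITS setup:

import torch

# Remap the fairseq VITS key names to the layout expected here, then save the
# result so it can later be loaded with `model.load_state_dict(new_state_dict)`.
new_state_dict = rehash_fairseq_vits_checkpoint("fairseq_vits_generator.pth")
torch.save({"model": new_state_dict}, "converted_vits_checkpoint.pth")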
@@ -0,0 +1,258 @@
import numpy as np
import torch
from scipy.stats import betabinom
from torch.nn import functional as F
try:
from TTS.tts.utils.monotonic_align.core import maximum_path_c
CYTHON = True
except ModuleNotFoundError:
CYTHON = False
class StandardScaler:
"""StandardScaler for mean-scale normalization with the given mean and scale values."""
def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None:
self.mean_ = mean
self.scale_ = scale
def set_stats(self, mean, scale):
self.mean_ = mean
self.scale_ = scale
def reset_stats(self):
delattr(self, "mean_")
delattr(self, "scale_")
def transform(self, X):
X = np.asarray(X)
X -= self.mean_
X /= self.scale_
return X
def inverse_transform(self, X):
X = np.asarray(X)
X *= self.scale_
X += self.mean_
return X
# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
"""Create a sequence mask for filtering padding in a sequence tensor.
Args:
sequence_length (torch.tensor): Sequence lengths.
max_len (int, Optional): Maximum sequence length. Defaults to None.
Shapes:
- mask: :math:`[B, T_max]`
"""
if max_len is None:
max_len = sequence_length.max()
seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
# B x T_max
return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)
def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_short=False):
"""Segment each sample in a batch based on the provided segment indices
Args:
x (torch.tensor): Input tensor.
segment_indices (torch.tensor): Segment indices.
segment_size (int): Expected output segment size.
pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size.
"""
# pad the input tensor if it is shorter than the segment size
if pad_short and x.shape[-1] < segment_size:
x = torch.nn.functional.pad(x, (0, segment_size - x.size(2)))
segments = torch.zeros_like(x[:, :, :segment_size])
for i in range(x.size(0)):
index_start = segment_indices[i]
index_end = index_start + segment_size
x_i = x[i]
if pad_short and index_end >= x.size(2):
# pad the sample if it is shorter than the segment size
x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2)))
segments[i] = x_i[:, index_start:index_end]
return segments
def rand_segments(
x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4, let_short_samples=False, pad_short=False
):
"""Create random segments based on the input lengths.
Args:
x (torch.tensor): Input tensor.
x_lengths (torch.tensor): Input lengths.
segment_size (int): Expected output segment size.
let_short_samples (bool): Allow shorter samples than the segment size.
pad_short (bool): Pad the end of input tensor with zeros if shorter than the segment size.
Shapes:
- x: :math:`[B, C, T]`
- x_lengths: :math:`[B]`
"""
_x_lenghts = x_lengths.clone()
B, _, T = x.size()
if pad_short:
if T < segment_size:
x = torch.nn.functional.pad(x, (0, segment_size - T))
T = segment_size
if _x_lenghts is None:
_x_lenghts = T
len_diff = _x_lenghts - segment_size
if let_short_samples:
_x_lenghts[len_diff < 0] = segment_size
len_diff = _x_lenghts - segment_size
else:
assert all(
len_diff > 0
), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}"
segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long()
ret = segment(x, segment_indices, segment_size, pad_short=pad_short)
return ret, segment_indices
def average_over_durations(values, durs):
"""Average values over durations.
Shapes:
- values: :math:`[B, 1, T_de]`
- durs: :math:`[B, T_en]`
- avg: :math:`[B, 1, T_en]`
"""
durs_cums_ends = torch.cumsum(durs, dim=1).long()
durs_cums_starts = torch.nn.functional.pad(durs_cums_ends[:, :-1], (1, 0))
values_nonzero_cums = torch.nn.functional.pad(torch.cumsum(values != 0.0, dim=2), (1, 0))
values_cums = torch.nn.functional.pad(torch.cumsum(values, dim=2), (1, 0))
bs, l = durs_cums_ends.size()
n_formants = values.size(1)
dcs = durs_cums_starts[:, None, :].expand(bs, n_formants, l)
dce = durs_cums_ends[:, None, :].expand(bs, n_formants, l)
values_sums = (torch.gather(values_cums, 2, dce) - torch.gather(values_cums, 2, dcs)).float()
values_nelems = (torch.gather(values_nonzero_cums, 2, dce) - torch.gather(values_nonzero_cums, 2, dcs)).float()
avg = torch.where(values_nelems == 0.0, values_nelems, values_sums / values_nelems)
return avg
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def generate_path(duration, mask):
"""
Shapes:
- duration: :math:`[B, T_en]`
- mask: :math:`[B, T_en, T_de]`
- path: :math:`[B, T_en, T_de]`
"""
b, t_x, t_y = mask.shape
cum_duration = torch.cumsum(duration, 1)
cum_duration_flat = cum_duration.view(b * t_x)
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
path = path.view(b, t_x, t_y)
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
path = path * mask
return path
def maximum_path(value, mask):
if CYTHON:
return maximum_path_cython(value, mask)
return maximum_path_numpy(value, mask)
def maximum_path_cython(value, mask):
"""Cython optimised version.
Shapes:
- value: :math:`[B, T_en, T_de]`
- mask: :math:`[B, T_en, T_de]`
"""
value = value * mask
device = value.device
dtype = value.dtype
value = value.data.cpu().numpy().astype(np.float32)
path = np.zeros_like(value).astype(np.int32)
mask = mask.data.cpu().numpy()
t_x_max = mask.sum(1)[:, 0].astype(np.int32)
t_y_max = mask.sum(2)[:, 0].astype(np.int32)
maximum_path_c(path, value, t_x_max, t_y_max)
return torch.from_numpy(path).to(device=device, dtype=dtype)
def maximum_path_numpy(value, mask, max_neg_val=None):
"""
Monotonic alignment search algorithm
Numpy-friendly version. It is about 4 times faster than the torch version.
value: [b, t_x, t_y]
mask: [b, t_x, t_y]
"""
if max_neg_val is None:
max_neg_val = -np.inf # Patch for Sphinx complaint
value = value * mask
device = value.device
dtype = value.dtype
value = value.cpu().detach().numpy()
mask = mask.cpu().detach().numpy().astype(bool)
b, t_x, t_y = value.shape
direction = np.zeros(value.shape, dtype=np.int64)
v = np.zeros((b, t_x), dtype=np.float32)
x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
for j in range(t_y):
v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
v1 = v
max_mask = v1 >= v0
v_max = np.where(max_mask, v1, v0)
direction[:, :, j] = max_mask
index_mask = x_range <= j
v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
direction = np.where(mask, direction, 1)
path = np.zeros(value.shape, dtype=np.float32)
index = mask[:, :, 0].sum(1).astype(np.int64) - 1
index_range = np.arange(b)
for j in reversed(range(t_y)):
path[index_range, index, j] = 1
index = index + direction[index_range, index, j] - 1
path = path * mask.astype(np.float32)
path = torch.from_numpy(path).to(device=device, dtype=dtype)
return path
def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0):
P, M = phoneme_count, mel_count
x = np.arange(0, P)
mel_text_probs = []
for i in range(1, M + 1):
a, b = scaling_factor * i, scaling_factor * (M + 1 - i)
rv = betabinom(P, a, b)
mel_i_prob = rv.pmf(x)
mel_text_probs.append(mel_i_prob)
return np.array(mel_text_probs)
def compute_attn_prior(x_len, y_len, scaling_factor=1.0):
"""Compute attention priors for the alignment network."""
attn_prior = beta_binomial_prior_distribution(
x_len,
y_len,
scaling_factor,
)
return attn_prior # [y_len, x_len]
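A small self-contained sketch (random tensors only, not part of the original file) exercising `sequence_mask` and `rand_segments` from the helpers above:

import torch

lengths = torch.tensor([4, 2, 6])
mask = sequence_mask(lengths)        # [3, 6] boolean mask, True where t < length
print(mask.int())

x = torch.randn(3, 80, 6)            # a [B, C, T] batch of features
segments, start_idxs = rand_segments(x, lengths, segment_size=2, let_short_samples=True, pad_short=True)
print(segments.shape, start_idxs)    # [3, 80, 2] plus the random start indices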
@@ -0,0 +1,125 @@
import os
from typing import Any, Dict, List
import fsspec
import numpy as np
import torch
from coqpit import Coqpit
from TTS.config import check_config_and_model_args
from TTS.tts.utils.managers import BaseIDManager
class LanguageManager(BaseIDManager):
"""Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by language.
Args:
language_ids_file_path (str, optional): Path to the metafile that maps language names to ids used by
TTS models. Defaults to "".
config (Coqpit, optional): Coqpit config that contains the language information in the datasets field.
Defaults to None.
Examples:
>>> manager = LanguageManager(language_ids_file_path=language_ids_file_path)
>>> language_id_mapper = manager.language_ids
"""
def __init__(
self,
language_ids_file_path: str = "",
config: Coqpit = None,
):
super().__init__(id_file_path=language_ids_file_path)
if config:
self.set_language_ids_from_config(config)
@property
def num_languages(self) -> int:
return len(list(self.name_to_id.keys()))
@property
def language_names(self) -> List:
return list(self.name_to_id.keys())
@staticmethod
def parse_language_ids_from_config(c: Coqpit) -> Dict:
"""Set language id from config.
Args:
c (Coqpit): Config
Returns:
Tuple[Dict, int]: Language ID mapping and the number of languages.
"""
languages = set({})
for dataset in c.datasets:
if "language" in dataset:
languages.add(dataset["language"])
else:
raise ValueError(f"Dataset {dataset['name']} has no language specified.")
return {name: i for i, name in enumerate(sorted(list(languages)))}
def set_language_ids_from_config(self, c: Coqpit) -> None:
"""Set language IDs from config samples.
Args:
c (Coqpit): Config.
"""
self.name_to_id = self.parse_language_ids_from_config(c)
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Any:
raise NotImplementedError
def set_ids_from_data(self, items: List, parse_key: str) -> Any:
raise NotImplementedError
def save_ids_to_file(self, file_path: str) -> None:
"""Save language IDs to a json file.
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.name_to_id)
@staticmethod
def init_from_config(config: Coqpit) -> "LanguageManager":
"""Initialize the language manager from a Coqpit config.
Args:
config (Coqpit): Coqpit config.
"""
language_manager = None
if check_config_and_model_args(config, "use_language_embedding", True):
if config.get("language_ids_file", None):
language_manager = LanguageManager(language_ids_file_path=config.language_ids_file)
language_manager = LanguageManager(config=config)
return language_manager
def _set_file_path(path):
"""Find the language_ids.json under the given path or the above it.
Intended to band aid the different paths returned in restored and continued training."""
path_restore = os.path.join(os.path.dirname(path), "language_ids.json")
path_continue = os.path.join(path, "language_ids.json")
fs = fsspec.get_mapper(path).fs
if fs.exists(path_restore):
return path_restore
if fs.exists(path_continue):
return path_continue
return None
def get_language_balancer_weights(items: list):
language_names = np.array([item["language"] for item in items])
unique_language_names = np.unique(language_names).tolist()
language_ids = [unique_language_names.index(l) for l in language_names]
language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names])
weight_language = 1.0 / language_count
# get weight for each sample
dataset_samples_weight = np.array([weight_language[l] for l in language_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()
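A hedged example of feeding `get_language_balancer_weights` into a sampler so rare languages are drawn as often as common ones (the items list is synthetic):

from torch.utils.data import WeightedRandomSampler

items = [{"language": "en"}] * 30 + [{"language": "pt-br"}] * 10
weights = get_language_balancer_weights(items)
sampler = WeightedRandomSampler(weights, num_samples=len(items))
# e.g. DataLoader(train_dataset, sampler=sampler, batch_size=8)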
@@ -0,0 +1,383 @@
import json
import random
from typing import Any, Dict, List, Tuple, Union
import fsspec
import numpy as np
import torch
from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.utils.audio import AudioProcessor
def load_file(path: str):
if path.endswith(".json"):
with fsspec.open(path, "r") as f:
return json.load(f)
elif path.endswith(".pth"):
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location="cpu")
else:
raise ValueError("Unsupported file type")
def save_file(obj: Any, path: str):
if path.endswith(".json"):
with fsspec.open(path, "w") as f:
json.dump(obj, f, indent=4)
elif path.endswith(".pth"):
with fsspec.open(path, "wb") as f:
torch.save(obj, f)
else:
raise ValueError("Unsupported file type")
class BaseIDManager:
"""Base `ID` Manager class. Every new `ID` manager must inherit this.
It defines common `ID` manager specific functions.
"""
def __init__(self, id_file_path: str = ""):
self.name_to_id = {}
if id_file_path:
self.load_ids_from_file(id_file_path)
@staticmethod
def _load_json(json_file_path: str) -> Dict:
with fsspec.open(json_file_path, "r") as f:
return json.load(f)
@staticmethod
def _save_json(json_file_path: str, data: dict) -> None:
with fsspec.open(json_file_path, "w") as f:
json.dump(data, f, indent=4)
def set_ids_from_data(self, items: List, parse_key: str) -> None:
"""Set IDs from data samples.
Args:
items (List): Data samples returned by `load_tts_samples()`.
"""
self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
def load_ids_from_file(self, file_path: str) -> None:
"""Set IDs from a file.
Args:
file_path (str): Path to the file.
"""
self.name_to_id = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file.
Args:
file_path (str): Path to the output file.
"""
save_file(self.name_to_id, file_path)
def get_random_id(self) -> Any:
"""Get a random embedding.
Args:
Returns:
np.ndarray: embedding.
"""
if self.name_to_id:
return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
return None
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]:
"""Parse IDs from data samples retured by `load_tts_samples()`.
Args:
items (list): Data sampled returned by `load_tts_samples()`.
parse_key (str): The key to being used to parse the data.
Returns:
Tuple[Dict]: speaker IDs.
"""
classes = sorted({item[parse_key] for item in items})
ids = {name: i for i, name in enumerate(classes)}
return ids
class EmbeddingManager(BaseIDManager):
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions.
It expects embeddings files in the following format:
::
{
'audio_file_key':{
'name': 'category_name',
'embedding': [<embedding_values>]
},
...
}
`audio_file_key` is a unique key to the audio file in the dataset. It can be the path to the file or any other unique key.
`embedding` is the embedding vector of the audio file.
`name` can be name of the speaker of the audio file.
"""
def __init__(
self,
embedding_file_path: Union[str, List[str]] = "",
id_file_path: str = "",
encoder_model_path: str = "",
encoder_config_path: str = "",
use_cuda: bool = False,
):
super().__init__(id_file_path=id_file_path)
self.embeddings = {}
self.embeddings_by_names = {}
self.clip_ids = []
self.encoder = None
self.encoder_ap = None
self.use_cuda = use_cuda
if embedding_file_path:
if isinstance(embedding_file_path, list):
self.load_embeddings_from_list_of_files(embedding_file_path)
else:
self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property
def num_embeddings(self):
"""Get number of embeddings."""
return len(self.embeddings)
@property
def num_names(self):
"""Get number of embeddings."""
return len(self.embeddings_by_names)
@property
def embedding_dim(self):
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
if self.embeddings:
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
return 0
@property
def embedding_names(self):
"""Get embedding names."""
return list(self.embeddings_by_names.keys())
def save_embeddings_to_file(self, file_path: str) -> None:
"""Save embeddings to a json file.
Args:
file_path (str): Path to the output file.
"""
save_file(self.embeddings, file_path)
@staticmethod
def read_embeddings_from_file(file_path: str):
"""Load embeddings from a json file.
Args:
file_path (str): Path to the file.
"""
embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in embeddings.values()})
name_to_id = {name: i for i, name in enumerate(speakers)}
clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
embeddings_by_names = {}
for x in embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return name_to_id, clip_ids, embeddings, embeddings_by_names
def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file.
Args:
file_path (str): Path to the target json file.
"""
self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
file_path
)
def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
"""Load embeddings from a list of json files and don't allow duplicate keys.
Args:
file_paths (List[str]): List of paths to the target json files.
"""
self.name_to_id = {}
self.clip_ids = []
self.embeddings_by_names = {}
self.embeddings = {}
for file_path in file_paths:
ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
# check colliding keys
duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
if duplicates:
raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
# store values
self.name_to_id.update(ids)
self.clip_ids.extend(clip_ids)
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
# reset name_to_id to get the right speaker ids
self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID.
Args:
clip_idx (str): Target clip ID.
Returns:
List: embedding as a list.
"""
return self.embeddings[clip_idx]["embedding"]
def get_embeddings_by_name(self, idx: str) -> List[List]:
"""Get all embeddings of a speaker.
Args:
idx (str): Target name.
Returns:
List[List]: all the embeddings of the given speaker.
"""
return self.embeddings_by_names[idx]
def get_embeddings_by_names(self) -> Dict:
"""Get all embeddings by names.
Returns:
Dict: all the embeddings of each speaker.
"""
embeddings_by_names = {}
for x in self.embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return embeddings_by_names
def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
"""Get mean embedding of a idx.
Args:
idx (str): Target name.
num_samples (int, optional): Number of samples to be averaged. Defaults to None.
randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False.
Returns:
np.ndarray: Mean embedding.
"""
embeddings = self.get_embeddings_by_name(idx)
if num_samples is None:
embeddings = np.stack(embeddings).mean(0)
else:
assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}"
if randomize:
embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0)
else:
embeddings = np.stack(embeddings[:num_samples]).mean(0)
return embeddings
def get_random_embedding(self) -> Any:
"""Get a random embedding.
Args:
Returns:
np.ndarray: embedding.
"""
if self.embeddings:
return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"]
return None
def get_clips(self) -> List:
return sorted(self.embeddings.keys())
def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
"""Initialize a speaker encoder model.
Args:
model_path (str): Model file path.
config_path (str): Model config file path.
use_cuda (bool, optional): Use CUDA. Defaults to False.
"""
self.use_cuda = use_cuda
self.encoder_config = load_config(config_path)
self.encoder = setup_encoder_model(self.encoder_config)
self.encoder_criterion = self.encoder.load_checkpoint(
self.encoder_config, model_path, eval=True, use_cuda=use_cuda, cache=True
)
self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:
"""Compute a embedding from a given audio file.
Args:
wav_file (Union[str, List[str]]): Target file path.
Returns:
list: Computed embedding.
"""
def _compute(wav_file: str):
waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)
if not self.encoder_config.model_params.get("use_torch_spec", False):
m_input = self.encoder_ap.melspectrogram(waveform)
m_input = torch.from_numpy(m_input)
else:
m_input = torch.from_numpy(waveform)
if self.use_cuda:
m_input = m_input.cuda()
m_input = m_input.unsqueeze(0)
embedding = self.encoder.compute_embedding(m_input)
return embedding
if isinstance(wav_file, list):
# compute the mean embedding
embeddings = None
for wf in wav_file:
embedding = _compute(wf)
if embeddings is None:
embeddings = embedding
else:
embeddings += embedding
return (embeddings / len(wav_file))[0].tolist()
embedding = _compute(wav_file)
return embedding[0].tolist()
def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
"""Compute embedding from features.
Args:
feats (Union[torch.Tensor, np.ndarray]): Input features.
Returns:
List: computed embedding.
"""
if isinstance(feats, np.ndarray):
feats = torch.from_numpy(feats)
if feats.ndim == 2:
feats = feats.unsqueeze(0)
if self.use_cuda:
feats = feats.cuda()
return self.encoder.compute_embedding(feats)
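A hedged sketch of the embeddings-file format expected by `EmbeddingManager` and a couple of its accessors; the file name, speaker names and vectors are made up:

import json

embeddings = {
    "clip_0001.wav": {"name": "speakerA", "embedding": [0.1, 0.2, 0.3]},
    "clip_0002.wav": {"name": "speakerA", "embedding": [0.3, 0.2, 0.1]},
    "clip_0003.wav": {"name": "speakerB", "embedding": [0.9, 0.8, 0.7]},
}
with open("embeddings.json", "w") as f:
    json.dump(embeddings, f)

manager = EmbeddingManager(embedding_file_path="embeddings.json")
print(manager.num_embeddings, manager.embedding_dim)  # 3 3
print(manager.get_mean_embedding("speakerA"))         # mean of the two speakerA vectors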
@@ -0,0 +1,15 @@
def alignment_diagonal_score(alignments, binary=False):
    """Compute how diagonal alignment predictions are. It is useful
    to measure the alignment consistency of a model.

    Args:
        alignments (torch.Tensor): batch of alignments.
        binary (bool): if True, ignore scores and consider attention
            as a binary mask.

    Shape:
        - alignments : :math:`[B, T_de, T_en]`
    """
    maxs = alignments.max(dim=1)[0]
    if binary:
        maxs[maxs > 0] = 1
    return maxs.mean(dim=1).mean(dim=0).item()
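A quick sanity check (synthetic alignments only) for `alignment_diagonal_score`:

import torch

diagonal = torch.eye(6).unsqueeze(0).repeat(2, 1, 1)   # [B, T_de, T_en], perfectly diagonal
print(alignment_diagonal_score(diagonal))              # 1.0
print(alignment_diagonal_score(torch.rand(2, 6, 6)))   # noticeably lower for noisy attention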
@@ -0,0 +1,47 @@
import numpy as np

cimport cython
cimport numpy as np

from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef float tmp
    cdef int index = t_x - 1

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[x, y-1]
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[x-1, y-1]
            value[x, y] = max(v_cur, v_prev) + value[x, y]

    for y in range(t_y - 1, -1, -1):
        path[index, y] = 1
        if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
            index = index - 1


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
    cdef int b = values.shape[0]
    cdef int i
    for i in prange(b, nogil=True):
        maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
@@ -0,0 +1,7 @@
# from distutils.core import setup
# from Cython.Build import cythonize
# import numpy
# setup(name='monotonic_align',
# ext_modules=cythonize("core.pyx"),
# include_dirs=[numpy.get_include()])
@@ -0,0 +1,222 @@
import json
import os
from typing import Any, Dict, List, Union
import fsspec
import numpy as np
import torch
from coqpit import Coqpit
from TTS.config import get_from_config_or_model_args_with_default
from TTS.tts.utils.managers import EmbeddingManager
class SpeakerManager(EmbeddingManager):
"""Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by speaker or clip.
There are 3 different scenarios considered:
1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer.
2. Models using d-vectors. The datafile includes a dictionary in the following format.
::
{
'clip_name.wav':{
'name': 'speakerA',
'embedding': [<d_vector_values>]
},
...
}
3. Computing the d-vectors by the speaker encoder. It loads the speaker encoder model and
computes the d-vectors for a given clip or speaker.
Args:
d_vectors_file_path (str, optional): Path to the metafile including the d-vectors. Defaults to "".
speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by
TTS models. Defaults to "".
encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
Examples:
>>> # load audio processor and speaker encoder
>>> ap = AudioProcessor(**config.audio)
>>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
>>> # load a sample audio and compute embedding
>>> waveform = ap.load_wav(sample_wav_path)
>>> mel = ap.melspectrogram(waveform)
>>> d_vector = manager.compute_embeddings(mel.T)
"""
def __init__(
self,
data_items: List[List[Any]] = None,
d_vectors_file_path: str = "",
speaker_id_file_path: str = "",
encoder_model_path: str = "",
encoder_config_path: str = "",
use_cuda: bool = False,
):
super().__init__(
embedding_file_path=d_vectors_file_path,
id_file_path=speaker_id_file_path,
encoder_model_path=encoder_model_path,
encoder_config_path=encoder_config_path,
use_cuda=use_cuda,
)
if data_items:
self.set_ids_from_data(data_items, parse_key="speaker_name")
@property
def num_speakers(self):
return len(self.name_to_id)
@property
def speaker_names(self):
return list(self.name_to_id.keys())
def get_speakers(self) -> List:
return self.name_to_id
@staticmethod
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
"""Initialize a speaker manager from config
Args:
config (Coqpit): Config object.
samples (Union[List[List], List[Dict]], optional): List of data samples to parse out the speaker names.
Defaults to None.
Returns:
SpeakerManager: Speaker manager object.
"""
speaker_manager = None
if get_from_config_or_model_args_with_default(config, "use_speaker_embedding", False):
if samples:
speaker_manager = SpeakerManager(data_items=samples)
if get_from_config_or_model_args_with_default(config, "speaker_file", None):
speaker_manager = SpeakerManager(
speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
)
if get_from_config_or_model_args_with_default(config, "speakers_file", None):
speaker_manager = SpeakerManager(
speaker_id_file_path=get_from_config_or_model_args_with_default(config, "speakers_file", None)
)
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
speaker_manager = SpeakerManager()
if get_from_config_or_model_args_with_default(config, "d_vector_file", None):
speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None)
)
return speaker_manager
def _set_file_path(path):
"""Find the speakers.json under the given path or the above it.
Intended to band aid the different paths returned in restored and continued training."""
path_restore = os.path.join(os.path.dirname(path), "speakers.json")
path_continue = os.path.join(path, "speakers.json")
fs = fsspec.get_mapper(path).fs
if fs.exists(path_restore):
return path_restore
if fs.exists(path_continue):
return path_continue
raise FileNotFoundError(f" [!] `speakers.json` not found in {path}")
def load_speaker_mapping(out_path):
"""Loads speaker mapping if already present."""
if os.path.splitext(out_path)[1] == ".json":
json_file = out_path
else:
json_file = _set_file_path(out_path)
with fsspec.open(json_file, "r") as f:
return json.load(f)
def save_speaker_mapping(out_path, speaker_mapping):
"""Saves speaker mapping if not yet present."""
if out_path is not None:
speakers_json_path = _set_file_path(out_path)
with fsspec.open(speakers_json_path, "w") as f:
json.dump(speaker_mapping, f, indent=4)
def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager:
"""Initiate a `SpeakerManager` instance by the provided config.
Args:
c (Coqpit): Model configuration.
restore_path (str): Path to a previous training folder.
data (List): Data samples used in training to infer speakers from. It must be provided if speaker embedding
layers is used. Defaults to None.
out_path (str, optional): Save the generated speaker IDs to an output path. Defaults to None.
Returns:
SpeakerManager: initialized and ready to use instance.
"""
speaker_manager = SpeakerManager()
if c.use_speaker_embedding:
if data is not None:
speaker_manager.set_ids_from_data(data, parse_key="speaker_name")
if restore_path:
speakers_file = _set_file_path(restore_path)
# restoring speaker manager from a previous run.
if c.use_d_vector_file:
# restore speaker manager with the embedding file
if not os.path.exists(speakers_file):
print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.d_vector_file")
if not os.path.exists(c.d_vector_file):
raise RuntimeError(
"You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file"
)
speaker_manager.load_embeddings_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(speakers_file)
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
speaker_ids_from_data = speaker_manager.name_to_id
speaker_manager.load_ids_from_file(speakers_file)
assert all(
speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings.
speaker_manager.load_embeddings_from_file(c.d_vector_file)
elif c.use_d_vector_file and not c.d_vector_file:
raise "use_d_vector_file is True, so you need pass a external speaker embedding file."
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file:
# new speaker manager with speaker IDs file.
speaker_manager.load_ids_from_file(c.speakers_file)
if speaker_manager.num_speakers > 0:
print(
" > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
)
)
# save file if path is defined
if out_path:
out_file_path = os.path.join(out_path, "speakers.json")
print(f" > Saving `speakers.json` to {out_file_path}.")
if c.use_d_vector_file and c.d_vector_file:
speaker_manager.save_embeddings_to_file(out_file_path)
else:
speaker_manager.save_ids_to_file(out_file_path)
return speaker_manager
def get_speaker_balancer_weights(items: list):
speaker_names = np.array([item["speaker_name"] for item in items])
unique_speaker_names = np.unique(speaker_names).tolist()
speaker_ids = [unique_speaker_names.index(l) for l in speaker_names]
speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names])
weight_speaker = 1.0 / speaker_count
dataset_samples_weight = np.array([weight_speaker[l] for l in speaker_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()
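A hedged usage sketch for `SpeakerManager` driven by a speaker-ID file only (the file name and contents are illustrative):

import json

with open("speakers.json", "w") as f:
    json.dump({"speakerA": 0, "speakerB": 1}, f)

manager = SpeakerManager(speaker_id_file_path="speakers.json")
print(manager.num_speakers)   # 2
print(manager.speaker_names)  # ['speakerA', 'speakerB']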
@@ -0,0 +1,383 @@
# Adopted from https://github.com/photosynthesis-team/piq
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss
def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor:
r"""Reduce input in batch dimension if needed.
Args:
x: Tensor with shape (N, *).
reduction: Specifies the reduction type:
``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'``
"""
if reduction == "none":
return x
if reduction == "mean":
return x.mean(dim=0)
if reduction == "sum":
return x.sum(dim=0)
raise ValueError("Unknown reduction. Expected one of {'none', 'mean', 'sum'}")
def _validate_input(
tensors: List[torch.Tensor],
dim_range: Tuple[int, int] = (0, -1),
data_range: Tuple[float, float] = (0.0, -1.0),
# size_dim_range: Tuple[float, float] = (0., -1.),
size_range: Optional[Tuple[int, int]] = None,
) -> None:
r"""Check that input(-s) satisfies the requirements
Args:
tensors: Tensors to check
dim_range: Allowed number of dimensions. (min, max)
data_range: Allowed range of values in tensors. (min, max)
size_range: Dimensions to include in size comparison. (start_dim, end_dim + 1)
"""
if not __debug__:
return
x = tensors[0]
for t in tensors:
assert torch.is_tensor(t), f"Expected torch.Tensor, got {type(t)}"
assert t.device == x.device, f"Expected tensors to be on {x.device}, got {t.device}"
if size_range is None:
assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}"
else:
assert (
t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]]
), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}"
if dim_range[0] == dim_range[1]:
assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}"
elif dim_range[0] < dim_range[1]:
assert (
dim_range[0] <= t.dim() <= dim_range[1]
), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}"
if data_range[0] < data_range[1]:
assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}"
assert t.max() <= data_range[1], f"Expected values to be lower or equal to {data_range[1]}, got {t.max()}"
def gaussian_filter(kernel_size: int, sigma: float) -> torch.Tensor:
r"""Returns 2D Gaussian kernel N(0,`sigma`^2)
Args:
kernel_size: Size of the kernel
sigma: Std of the distribution
Returns:
gaussian_kernel: Tensor with shape (1, kernel_size, kernel_size)
"""
coords = torch.arange(kernel_size, dtype=torch.float32)
coords -= (kernel_size - 1) / 2.0
g = coords**2
g = (-(g.unsqueeze(0) + g.unsqueeze(1)) / (2 * sigma**2)).exp()
g /= g.sum()
return g.unsqueeze(0)
def ssim(
x: torch.Tensor,
y: torch.Tensor,
kernel_size: int = 11,
kernel_sigma: float = 1.5,
data_range: Union[int, float] = 1.0,
reduction: str = "mean",
full: bool = False,
downsample: bool = True,
k1: float = 0.01,
k2: float = 0.03,
) -> List[torch.Tensor]:
r"""Interface of Structural Similarity (SSIM) index.
Inputs supposed to be in range ``[0, data_range]``.
To match performance with skimage and tensorflow set ``'downsample' = True``.
Args:
x: An input tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
y: A target tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
kernel_size: The side-length of the sliding window used in comparison. Must be an odd value.
kernel_sigma: Sigma of normal distribution.
data_range: Maximum value range of images (usually 1.0 or 255).
reduction: Specifies the reduction type:
``'none'`` | ``'mean'`` | ``'sum'``. Default:``'mean'``
full: Return cs map or not.
downsample: Perform average pool before SSIM computation. Default: True
k1: Algorithm parameter, K1 (small constant).
k2: Algorithm parameter, K2 (small constant).
Try a larger K2 constant (e.g. 0.4) if you get negative or NaN results.
Returns:
Value of Structural Similarity (SSIM) index. In case of 5D input tensors, complex value is returned
as a tensor of size 2.
References:
Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004).
Image quality assessment: From error visibility to structural similarity.
IEEE Transactions on Image Processing, 13, 600-612.
https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
DOI: `10.1109/TIP.2003.819861`
"""
assert kernel_size % 2 == 1, f"Kernel size must be odd, got [{kernel_size}]"
_validate_input([x, y], dim_range=(4, 5), data_range=(0, data_range))
x = x / float(data_range)
y = y / float(data_range)
# Averagepool image if the size is large enough
f = max(1, round(min(x.size()[-2:]) / 256))
if (f > 1) and downsample:
x = F.avg_pool2d(x, kernel_size=f)
y = F.avg_pool2d(y, kernel_size=f)
kernel = gaussian_filter(kernel_size, kernel_sigma).repeat(x.size(1), 1, 1, 1).to(y)
_compute_ssim_per_channel = _ssim_per_channel_complex if x.dim() == 5 else _ssim_per_channel
ssim_map, cs_map = _compute_ssim_per_channel(x=x, y=y, kernel=kernel, k1=k1, k2=k2)
ssim_val = ssim_map.mean(1)
cs = cs_map.mean(1)
ssim_val = _reduce(ssim_val, reduction)
cs = _reduce(cs, reduction)
if full:
return [ssim_val, cs]
return ssim_val
class SSIMLoss(_Loss):
r"""Creates a criterion that measures the structural similarity index error between
each element in the input :math:`x` and target :math:`y`.
To match performance with skimage and tensorflow set ``'downsample' = True``.
The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
.. math::
SSIM = \{ssim_1,\dots,ssim_{N \times C}\}\\
ssim_{l}(x, y) = \frac{(2 \mu_x \mu_y + c_1) (2 \sigma_{xy} + c_2)}
{(\mu_x^2 +\mu_y^2 + c_1)(\sigma_x^2 +\sigma_y^2 + c_2)},
where :math:`N` is the batch size, `C` is the channel size. If :attr:`reduction` is not ``'none'``
(default ``'mean'``), then:
.. math::
SSIMLoss(x, y) =
\begin{cases}
\operatorname{mean}(1 - SSIM), & \text{if reduction} = \text{'mean';}\\
\operatorname{sum}(1 - SSIM), & \text{if reduction} = \text{'sum'.}
\end{cases}
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total
of :math:`n` elements each.
The sum operation still operates over all the elements, and divides by :math:`n`.
The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
In case of 5D input tensors, complex value is returned as a tensor of size 2.
Args:
kernel_size: By default, the mean and covariance of a pixel is obtained
by convolution with given filter_size.
kernel_sigma: Standard deviation for Gaussian kernel.
k1: Coefficient related to c1 in the above equation.
k2: Coefficient related to c2 in the above equation.
downsample: Perform average pool before SSIM computation. Default: True
reduction: Specifies the reduction type:
``'none'`` | ``'mean'`` | ``'sum'``. Default:``'mean'``
data_range: Maximum value range of images (usually 1.0 or 255).
Examples:
>>> loss = SSIMLoss()
>>> x = torch.rand(3, 3, 256, 256, requires_grad=True)
>>> y = torch.rand(3, 3, 256, 256)
>>> output = loss(x, y)
>>> output.backward()
References:
Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004).
Image quality assessment: From error visibility to structural similarity.
IEEE Transactions on Image Processing, 13, 600-612.
https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
DOI:`10.1109/TIP.2003.819861`
"""
__constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"]
def __init__(
self,
kernel_size: int = 11,
kernel_sigma: float = 1.5,
k1: float = 0.01,
k2: float = 0.03,
downsample: bool = True,
reduction: str = "mean",
data_range: Union[int, float] = 1.0,
) -> None:
super().__init__()
# Generic loss parameters.
self.reduction = reduction
# Loss-specific parameters.
self.kernel_size = kernel_size
# This check might look redundant because kernel size is checked within the ssim function anyway.
# However, this check allows us to fail fast when the loss is being initialised, before training has started.
assert kernel_size % 2 == 1, f"Kernel size must be odd, got [{kernel_size}]"
self.kernel_sigma = kernel_sigma
self.k1 = k1
self.k2 = k2
self.downsample = downsample
self.data_range = data_range
def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
r"""Computation of Structural Similarity (SSIM) index as a loss function.
Args:
x: An input tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
y: A target tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`.
Returns:
Value of SSIM loss to be minimized, i.e ``1 - ssim`` in [0, 1] range. In case of 5D input tensors,
complex value is returned as a tensor of size 2.
"""
score = ssim(
x=x,
y=y,
kernel_size=self.kernel_size,
kernel_sigma=self.kernel_sigma,
downsample=self.downsample,
data_range=self.data_range,
reduction=self.reduction,
full=False,
k1=self.k1,
k2=self.k2,
)
return torch.ones_like(score) - score
def _ssim_per_channel(
x: torch.Tensor,
y: torch.Tensor,
kernel: torch.Tensor,
k1: float = 0.01,
k2: float = 0.03,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
r"""Calculate Structural Similarity (SSIM) index for X and Y per channel.
Args:
x: An input tensor. Shape :math:`(N, C, H, W)`.
y: A target tensor. Shape :math:`(N, C, H, W)`.
kernel: 2D Gaussian kernel.
k1: Algorithm parameter, K1 (small constant, see [1]).
k2: Algorithm parameter, K2 (small constant, see [1]).
Try a larger K2 constant (e.g. 0.4) if you get negative or NaN results.
Returns:
Full Value of Structural Similarity (SSIM) index.
"""
if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2):
raise ValueError(
f"Kernel size can't be greater than actual input size. Input size: {x.size()}. "
f"Kernel size: {kernel.size()}"
)
c1 = k1**2
c2 = k2**2
n_channels = x.size(1)
mu_x = F.conv2d(x, weight=kernel, stride=1, padding=0, groups=n_channels)
mu_y = F.conv2d(y, weight=kernel, stride=1, padding=0, groups=n_channels)
mu_xx = mu_x**2
mu_yy = mu_y**2
mu_xy = mu_x * mu_y
sigma_xx = F.conv2d(x**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xx
sigma_yy = F.conv2d(y**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_yy
sigma_xy = F.conv2d(x * y, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xy
# Contrast sensitivity (CS) with alpha = beta = gamma = 1.
cs = (2.0 * sigma_xy + c2) / (sigma_xx + sigma_yy + c2)
# Structural similarity (SSIM)
ss = (2.0 * mu_xy + c1) / (mu_xx + mu_yy + c1) * cs
ssim_val = ss.mean(dim=(-1, -2))
cs = cs.mean(dim=(-1, -2))
return ssim_val, cs
def _ssim_per_channel_complex(
x: torch.Tensor,
y: torch.Tensor,
kernel: torch.Tensor,
k1: float = 0.01,
k2: float = 0.03,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel.
Args:
x: An input tensor. Shape :math:`(N, C, H, W, 2)`.
y: A target tensor. Shape :math:`(N, C, H, W, 2)`.
kernel: 2-D gauss kernel.
k1: Algorithm parameter, K1 (small constant, see [1]).
k2: Algorithm parameter, K2 (small constant, see [1]).
Try a larger K2 constant (e.g. 0.4) if you get negative or NaN results.
Returns:
Full Value of Complex Structural Similarity (SSIM) index.
"""
n_channels = x.size(1)
if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2):
raise ValueError(
f"Kernel size can't be greater than actual input size. Input size: {x.size()}. "
f"Kernel size: {kernel.size()}"
)
c1 = k1**2
c2 = k2**2
x_real = x[..., 0]
x_imag = x[..., 1]
y_real = y[..., 0]
y_imag = y[..., 1]
mu1_real = F.conv2d(x_real, weight=kernel, stride=1, padding=0, groups=n_channels)
mu1_imag = F.conv2d(x_imag, weight=kernel, stride=1, padding=0, groups=n_channels)
mu2_real = F.conv2d(y_real, weight=kernel, stride=1, padding=0, groups=n_channels)
mu2_imag = F.conv2d(y_imag, weight=kernel, stride=1, padding=0, groups=n_channels)
mu1_sq = mu1_real.pow(2) + mu1_imag.pow(2)
mu2_sq = mu2_real.pow(2) + mu2_imag.pow(2)
mu1_mu2_real = mu1_real * mu2_real - mu1_imag * mu2_imag
mu1_mu2_imag = mu1_real * mu2_imag + mu1_imag * mu2_real
compensation = 1.0
x_sq = x_real.pow(2) + x_imag.pow(2)
y_sq = y_real.pow(2) + y_imag.pow(2)
x_y_real = x_real * y_real - x_imag * y_imag
x_y_imag = x_real * y_imag + x_imag * y_real
sigma1_sq = F.conv2d(x_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_sq
sigma2_sq = F.conv2d(y_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu2_sq
sigma12_real = F.conv2d(x_y_real, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_real
sigma12_imag = F.conv2d(x_y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_imag
sigma12 = torch.stack((sigma12_imag, sigma12_real), dim=-1)
mu1_mu2 = torch.stack((mu1_mu2_real, mu1_mu2_imag), dim=-1)
# Set alpha = beta = gamma = 1.
cs_map = (sigma12 * 2 + c2 * compensation) / (sigma1_sq.unsqueeze(-1) + sigma2_sq.unsqueeze(-1) + c2 * compensation)
ssim_map = (mu1_mu2 * 2 + c1 * compensation) / (mu1_sq.unsqueeze(-1) + mu2_sq.unsqueeze(-1) + c1 * compensation)
ssim_map = ssim_map * cs_map
ssim_val = ssim_map.mean(dim=(-2, -3))
cs = cs_map.mean(dim=(-2, -3))
return ssim_val, cs
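A minimal sketch of `SSIMLoss` applied to spectrogram-like tensors treated as single-channel images; the shapes and values are synthetic:

import torch

criterion = SSIMLoss(kernel_size=11, data_range=1.0)
pred = torch.rand(4, 1, 80, 120, requires_grad=True)   # [B, C, n_mels, T] scaled to [0, 1]
target = torch.rand(4, 1, 80, 120)
loss = criterion(pred, target)   # scalar `1 - ssim` value to minimize
loss.backward()
print(loss.item())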
@@ -0,0 +1,343 @@
from typing import Dict
import numpy as np
import torch
from torch import nn
def numpy_to_torch(np_array, dtype, cuda=False, device="cpu"):
if cuda:
device = "cuda"
if np_array is None:
return None
tensor = torch.as_tensor(np_array, dtype=dtype, device=device)
return tensor
def compute_style_mel(style_wav, ap, cuda=False, device="cpu"):
if cuda:
device = "cuda"
style_mel = torch.FloatTensor(
ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate)),
device=device,
).unsqueeze(0)
return style_mel
def run_model_torch(
model: nn.Module,
inputs: torch.Tensor,
speaker_id: int = None,
style_mel: torch.Tensor = None,
style_text: str = None,
d_vector: torch.Tensor = None,
language_id: torch.Tensor = None,
) -> Dict:
"""Run a torch model for inference. It does not support batch inference.
Args:
model (nn.Module): The model to run inference.
inputs (torch.Tensor): Input tensor with character ids.
speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None.
style_mel (torch.Tensor, optional): Spectrograms used for voice styling. Defaults to None.
d_vector (torch.Tensor, optional): d-vector for multi-speaker models. Defaults to None.
Returns:
Dict: model outputs.
"""
input_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device)
if hasattr(model, "module"):
_func = model.module.inference
else:
_func = model.inference
outputs = _func(
inputs,
aux_input={
"x_lengths": input_lengths,
"speaker_ids": speaker_id,
"d_vectors": d_vector,
"style_mel": style_mel,
"style_text": style_text,
"language_ids": language_id,
},
)
return outputs
def trim_silence(wav, ap):
return wav[: ap.find_endpoint(wav)]
def inv_spectrogram(postnet_output, ap, CONFIG):
if CONFIG.model.lower() in ["tacotron"]:
wav = ap.inv_spectrogram(postnet_output.T)
else:
wav = ap.inv_melspectrogram(postnet_output.T)
return wav
def id_to_torch(aux_id, cuda=False, device="cpu"):
if cuda:
device = "cuda"
if aux_id is not None:
aux_id = np.asarray(aux_id)
aux_id = torch.from_numpy(aux_id).to(device)
return aux_id
def embedding_to_torch(d_vector, cuda=False, device="cpu"):
if cuda:
device = "cuda"
if d_vector is not None:
d_vector = np.asarray(d_vector)
d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor)
d_vector = d_vector.squeeze().unsqueeze(0).to(device)
return d_vector
# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
"""Apply griffin-lim to each sample iterating throught the first dimension.
Args:
inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
input_lens (Tensor or np.Array): 1D array of sample lengths.
CONFIG (Dict): TTS config.
ap (AudioProcessor): TTS audio processor.
"""
wavs = []
for idx, spec in enumerate(inputs):
wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding
wav = inv_spectrogram(spec, ap, CONFIG)
# assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}"
wavs.append(wav[:wav_len])
return wavs
def synthesis(
model,
text,
CONFIG,
use_cuda,
speaker_id=None,
style_wav=None,
style_text=None,
use_griffin_lim=False,
do_trim_silence=False,
d_vector=None,
language_id=None,
):
"""Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
the vocoder model.
Args:
model (TTS.tts.models):
The TTS model to synthesize audio with.
text (str):
The input text to convert to speech.
CONFIG (Coqpit):
Model configuration.
use_cuda (bool):
Enable/disable CUDA.
speaker_id (int):
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
style_wav (str | Dict[str, float]):
Path or tensor to/of a waveform used for computing the style embedding based on GST or Capacitron.
Defaults to None, meaning that Capacitron models will sample from the prior distribution to
generate random but realistic prosody.
style_text (str):
Transcription of style_wav for Capacitron models. Defaults to None.
enable_eos_bos_chars (bool):
Enable special characters for the end and beginning of sentences. Defaults to False.
do_trim_silence (bool):
Trim silence after synthesis. Defaults to False.
d_vector (torch.Tensor):
d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
language_id (int):
Language ID passed to the language embedding layer in multi-lingual models. Defaults to None.
"""
# device
device = next(model.parameters()).device
if use_cuda:
device = "cuda"
# GST or Capacitron processing
# TODO: need to handle the case of setting both gst and capacitron to true somewhere
style_mel = None
if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
if isinstance(style_wav, dict):
style_mel = style_wav
else:
style_mel = compute_style_mel(style_wav, model.ap, device=device)
if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None:
style_mel = compute_style_mel(style_wav, model.ap, device=device)
style_mel = style_mel.transpose(1, 2) # [1, time, depth]
language_name = None
if language_id is not None:
language = [k for k, v in model.language_manager.name_to_id.items() if v == language_id]
assert len(language) == 1, "language_id must be a valid language"
language_name = language[0]
# convert text to sequence of token IDs
text_inputs = np.asarray(
model.tokenizer.text_to_ids(text, language=language_name),
dtype=np.int32,
)
# pass tensors to backend
if speaker_id is not None:
speaker_id = id_to_torch(speaker_id, device=device)
if d_vector is not None:
d_vector = embedding_to_torch(d_vector, device=device)
if language_id is not None:
language_id = id_to_torch(language_id, device=device)
if not isinstance(style_mel, dict):
# GST or Capacitron style mel
style_mel = numpy_to_torch(style_mel, torch.float, device=device)
if style_text is not None:
style_text = np.asarray(
model.tokenizer.text_to_ids(style_text, language=language_id),
dtype=np.int32,
)
style_text = numpy_to_torch(style_text, torch.long, device=device)
style_text = style_text.unsqueeze(0)
text_inputs = numpy_to_torch(text_inputs, torch.long, device=device)
text_inputs = text_inputs.unsqueeze(0)
# synthesize voice
outputs = run_model_torch(
model,
text_inputs,
speaker_id,
style_mel,
style_text,
d_vector=d_vector,
language_id=language_id,
)
model_outputs = outputs["model_outputs"]
model_outputs = model_outputs[0].data.cpu().numpy()
alignments = outputs["alignments"]
# convert outputs to numpy
# plot results
wav = None
model_outputs = model_outputs.squeeze()
if model_outputs.ndim == 2: # [T, C_spec]
if use_griffin_lim:
wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, model.ap)
else: # [T,]
wav = model_outputs
return_dict = {
"wav": wav,
"alignments": alignments,
"text_inputs": text_inputs,
"outputs": outputs,
}
return return_dict
def transfer_voice(
model,
CONFIG,
use_cuda,
reference_wav,
speaker_id=None,
d_vector=None,
reference_speaker_id=None,
reference_d_vector=None,
do_trim_silence=False,
use_griffin_lim=False,
):
"""Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
the vocoder model.
Args:
model (TTS.tts.models):
The TTS model to synthesize audio with.
CONFIG (Coqpit):
Model configuration.
use_cuda (bool):
Enable/disable CUDA.
reference_wav (str):
Path of the reference wav file to be used for voice conversion.
speaker_id (int):
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
d_vector (torch.Tensor):
d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
reference_speaker_id (int):
Reference speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
reference_d_vector (torch.Tensor):
Reference d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
enable_eos_bos_chars (bool):
Enable special characters for the end and beginning of sentences. Defaults to False.
do_trim_silence (bool):
Trim silence after synthesis. Defaults to False.
"""
# device
device = next(model.parameters()).device
if use_cuda:
device = "cuda"
# pass tensors to backend
if speaker_id is not None:
speaker_id = id_to_torch(speaker_id, device=device)
if d_vector is not None:
d_vector = embedding_to_torch(d_vector, device=device)
if reference_d_vector is not None:
reference_d_vector = embedding_to_torch(reference_d_vector, device=device)
# load reference_wav audio
reference_wav = embedding_to_torch(
model.ap.load_wav(
reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
),
device=device,
)
if hasattr(model, "module"):
_func = model.module.inference_voice_conversion
else:
_func = model.inference_voice_conversion
model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
# convert outputs to numpy
# plot results
wav = None
model_outputs = model_outputs.squeeze()
if model_outputs.ndim == 2: # [T, C_spec]
if use_griffin_lim:
wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, model.ap)
else: # [T,]
wav = model_outputs
return wav
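# Usage sketch (illustrative only): `tts_model` and `config` are placeholders for an already
# loaded model from `TTS.tts.models` and its Coqpit config; they are not defined in this module.
#
#     outputs = synthesis(
#         model=tts_model,
#         text="Hello world.",
#         CONFIG=config,
#         use_cuda=False,
#         use_griffin_lim=True,
#         do_trim_silence=True,
#     )
#     wav = outputs["wav"]  # numpy waveform when Griffin-Lim decoding is enabled
#
# `transfer_voice` follows the same pattern, but takes a `reference_wav` path instead of text
# and returns the converted waveform (or output features) directly.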
@@ -0,0 +1 @@
from TTS.tts.utils.text.tokenizer import TTSTokenizer
@@ -0,0 +1,121 @@
import re
import bangla
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer
# initialize
bnorm = Normalizer()
attribution_dict = {
"সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
"আঃ": "আলাইহিস সালাম",
"রাঃ": "রাদিআল্লাহু আনহু",
"রহঃ": "রহমাতুল্লাহি আলাইহি",
"রহিঃ": "রহিমাহুল্লাহ",
"হাফিঃ": "হাফিযাহুল্লাহ",
"বায়ান": "বাইআন",
"দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
# "আয়াত" : "আইআত",#আইআত
# "ওয়া" : "ওআ",
# "ওয়াসাল্লাম" : "ওআসাল্লাম",
# "কেন" : "কেনো",
# "কোন" : "কোনো",
# "বল" : "বলো",
# "চল" : "চলো",
# "কর" : "করো",
# "রাখ" : "রাখো",
"": "",
"": "",
# "য়" : "অ",
# "সম্প্রদায়" : "সম্প্রদাই",
# "রয়েছে" : "রইছে",
# "রয়েছ" : "রইছ",
"/": " বাই ",
}
def tag_text(text: str):
# remove multiple spaces
text = re.sub(" +", " ", text)
# create start and end
text = "start" + text + "end"
# tag text
parts = re.split("[\u0600-\u06FF]+", text)
# remove non chars
parts = [p for p in parts if p.strip()]
# unique parts
parts = set(parts)
# tag the text
for m in parts:
if len(m.strip()) > 1:
text = text.replace(m, f"{m}")
# clean-tags
text = text.replace("start", "")
text = text.replace("end", "")
return text
def normalize(sen):
global bnorm # pylint: disable=global-statement
_words = [bnorm(word)["normalized"] for word in sen.split()]
return " ".join([word for word in _words if word is not None])
def expand_full_attribution(text):
for word, attr in attribution_dict.items():
if word in text:
text = text.replace(word, normalize(attr))
return text
def collapse_whitespace(text):
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
return re.sub(_whitespace_re, " ", text)
def bangla_text_to_phonemes(text: str) -> str:
# english numbers to bangla conversion
res = re.search("[0-9]", text)
if res is not None:
text = bangla.convert_english_digit_to_bangla_digit(text)
# replace ':' in between two bangla numbers with ' এর '
pattern = r"[, ১, ২, ৩, , ৫, ৬, , ৮, ৯]:[, ১, ২, ৩, , ৫, ৬, , ৮, ৯]"
matches = re.findall(pattern, text)
for m in matches:
r = m.replace(":", " এর ")
text = text.replace(m, r)
# numerize text
text = numerize(text)
# tag sections
text = tag_text(text)
# text blocks
# blocks = text.split("")
# blocks = [b for b in blocks if b.strip()]
# create tuple of (lang,text)
if "" in text:
text = text.replace("", "").replace("", "")
# Split based on sentence ending Characters
bn_text = text.strip()
sentenceEnders = re.compile("[।!?]")
sentences = sentenceEnders.split(str(bn_text))
data = ""
for sent in sentences:
res = re.sub("\n", "", sent)
res = normalize(res)
# expand attributes
res = expand_full_attribution(res)
res = collapse_whitespace(res)
res += ""
data += res
return data
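if __name__ == "__main__":
    # Minimal demo (illustrative): the sample sentence is arbitrary and the exact output
    # depends on the bnnumerizer/bnunicodenormalizer versions imported above.
    sample_text = "আমি ২০২৩ সালে ঢাকায় গিয়েছিলাম।"
    print(bangla_text_to_phonemes(sample_text))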
@@ -0,0 +1,37 @@
import os
finder = None
def init():
try:
import jpype
import jpype.imports
except ModuleNotFoundError:
raise ModuleNotFoundError(
"Belarusian phonemizer requires to install module 'jpype1' manually. Try `pip install jpype1`."
)
try:
jar_path = os.environ["BEL_FANETYKA_JAR"]
except KeyError:
raise KeyError("You need to define 'BEL_FANETYKA_JAR' environment variable as path to the fanetyka.jar file")
jpype.startJVM(classpath=[jar_path])
# import the Java modules
from org.alex73.korpus.base import GrammarDB2, GrammarFinder
grammar_db = GrammarDB2.initializeFromJar()
global finder
finder = GrammarFinder(grammar_db)
def belarusian_text_to_phonemes(text: str) -> str:
# Initialize only on first run
if finder is None:
init()
from org.alex73.fanetyka.impl import FanetykaText
return str(FanetykaText(finder, text).ipa)
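if __name__ == "__main__":
    # Minimal demo (illustrative): requires `pip install jpype1` and the BEL_FANETYKA_JAR
    # environment variable pointing to a local fanetyka.jar (the path below is a placeholder).
    #   export BEL_FANETYKA_JAR=/path/to/fanetyka.jar
    print(belarusian_text_to_phonemes("добры дзень"))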
@@ -0,0 +1,501 @@
from dataclasses import replace
from typing import Dict
from TTS.tts.configs.shared_configs import CharactersConfig
def parse_symbols():
return {
"pad": _pad,
"eos": _eos,
"bos": _bos,
"characters": _characters,
"punctuations": _punctuations,
"phonemes": _phonemes,
}
# DEFAULT SET OF GRAPHEMES
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>" # TODO: check if we need this alongside with PAD
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_punctuations = "!'(),-.:;? "
# DEFAULT SET OF IPA PHONEMES
# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
_diacrilics = "ɚ˞ɫ"
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
class BaseVocabulary:
"""Base Vocabulary class.
This class only needs a vocabulary dictionary without specifying the characters.
Args:
vocab (Dict): A dictionary of characters and their corresponding indices.
"""
def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
self.vocab = vocab
self.pad = pad
self.blank = blank
self.bos = bos
self.eos = eos
@property
def pad_id(self) -> int:
"""Return the index of the padding character. If the padding character is not specified, return the length
of the vocabulary."""
return self.char_to_id(self.pad) if self.pad else len(self.vocab)
@property
def blank_id(self) -> int:
"""Return the index of the blank character. If the blank character is not specified, return the length of
the vocabulary."""
return self.char_to_id(self.blank) if self.blank else len(self.vocab)
@property
def bos_id(self) -> int:
"""Return the index of the bos character. If the bos character is not specified, return the length of the
vocabulary."""
return self.char_to_id(self.bos) if self.bos else len(self.vocab)
@property
def eos_id(self) -> int:
"""Return the index of the eos character. If the eos character is not specified, return the length of the
vocabulary."""
return self.char_to_id(self.eos) if self.eos else len(self.vocab)
@property
def vocab(self):
"""Return the vocabulary dictionary."""
return self._vocab
@vocab.setter
def vocab(self, vocab):
"""Set the vocabulary dictionary and character mapping dictionaries."""
self._vocab, self._char_to_id, self._id_to_char = None, None, None
if vocab is not None:
self._vocab = vocab
self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
self._id_to_char = {
idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension
}
@staticmethod
def init_from_config(config, **kwargs):
"""Initialize from the given config."""
if config.characters is not None and "vocab_dict" in config.characters and config.characters.vocab_dict:
return (
BaseVocabulary(
config.characters.vocab_dict,
config.characters.pad,
config.characters.blank,
config.characters.bos,
config.characters.eos,
),
config,
)
return BaseVocabulary(**kwargs), config
def to_config(self) -> "CharactersConfig":
return CharactersConfig(
vocab_dict=self._vocab,
pad=self.pad,
eos=self.eos,
bos=self.bos,
blank=self.blank,
is_unique=False,
is_sorted=False,
)
@property
def num_chars(self):
"""Return number of tokens in the vocabulary."""
return len(self._vocab)
def char_to_id(self, char: str) -> int:
"""Map a character to an token ID."""
try:
return self._char_to_id[char]
except KeyError as e:
raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
def id_to_char(self, idx: int) -> str:
"""Map an token ID to a character."""
return self._id_to_char[idx]
class BaseCharacters:
"""🐸BaseCharacters class
Every new character class should inherit from this.
Characters are ordered as follows ```[PAD, EOS, BOS, BLANK, CHARACTERS, PUNCTUATIONS]```.
If you need a custom order, inherit from this class and override the ```_create_vocab``` method.
Args:
characters (str):
Main set of characters to be used in the vocabulary.
punctuations (str):
Characters to be treated as punctuation.
pad (str):
Special padding character that would be ignored by the model.
eos (str):
End of the sentence character.
bos (str):
Beginning of the sentence character.
blank (str):
Optional character used between characters by some models for better prosody.
is_unique (bool):
Remove duplicates from the provided characters. Defaults to False.
is_sorted (bool):
Sort the characters in alphabetical order. Only applies to `self.characters`. Defaults to True.
"""
def __init__(
self,
characters: str = None,
punctuations: str = None,
pad: str = None,
eos: str = None,
bos: str = None,
blank: str = None,
is_unique: bool = False,
is_sorted: bool = True,
) -> None:
self._characters = characters
self._punctuations = punctuations
self._pad = pad
self._eos = eos
self._bos = bos
self._blank = blank
self.is_unique = is_unique
self.is_sorted = is_sorted
self._create_vocab()
@property
def pad_id(self) -> int:
return self.char_to_id(self.pad) if self.pad else len(self.vocab)
@property
def blank_id(self) -> int:
return self.char_to_id(self.blank) if self.blank else len(self.vocab)
@property
def eos_id(self) -> int:
return self.char_to_id(self.eos) if self.eos else len(self.vocab)
@property
def bos_id(self) -> int:
return self.char_to_id(self.bos) if self.bos else len(self.vocab)
@property
def characters(self):
return self._characters
@characters.setter
def characters(self, characters):
self._characters = characters
self._create_vocab()
@property
def punctuations(self):
return self._punctuations
@punctuations.setter
def punctuations(self, punctuations):
self._punctuations = punctuations
self._create_vocab()
@property
def pad(self):
return self._pad
@pad.setter
def pad(self, pad):
self._pad = pad
self._create_vocab()
@property
def eos(self):
return self._eos
@eos.setter
def eos(self, eos):
self._eos = eos
self._create_vocab()
@property
def bos(self):
return self._bos
@bos.setter
def bos(self, bos):
self._bos = bos
self._create_vocab()
@property
def blank(self):
return self._blank
@blank.setter
def blank(self, blank):
self._blank = blank
self._create_vocab()
@property
def vocab(self):
return self._vocab
@vocab.setter
def vocab(self, vocab):
self._vocab = vocab
self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
self._id_to_char = {
idx: char for idx, char in enumerate(self.vocab) # pylint: disable=unnecessary-comprehension
}
@property
def num_chars(self):
return len(self._vocab)
def _create_vocab(self):
_vocab = self._characters
if self.is_unique:
_vocab = list(set(_vocab))
if self.is_sorted:
_vocab = sorted(_vocab)
_vocab = list(_vocab)
_vocab = [self._blank] + _vocab if self._blank is not None and len(self._blank) > 0 else _vocab
_vocab = [self._bos] + _vocab if self._bos is not None and len(self._bos) > 0 else _vocab
_vocab = [self._eos] + _vocab if self._eos is not None and len(self._eos) > 0 else _vocab
_vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
self.vocab = _vocab + list(self._punctuations)
if self.is_unique:
duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
assert (
len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
), f" [!] There are duplicate characters in the character set. {duplicates}"
def char_to_id(self, char: str) -> int:
try:
return self._char_to_id[char]
except KeyError as e:
raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
def id_to_char(self, idx: int) -> str:
return self._id_to_char[idx]
def print_log(self, level: int = 0):
"""
Prints the vocabulary in a nice format.
"""
indent = "\t" * level
print(f"{indent}| > Characters: {self._characters}")
print(f"{indent}| > Punctuations: {self._punctuations}")
print(f"{indent}| > Pad: {self._pad}")
print(f"{indent}| > EOS: {self._eos}")
print(f"{indent}| > BOS: {self._bos}")
print(f"{indent}| > Blank: {self._blank}")
print(f"{indent}| > Vocab: {self.vocab}")
print(f"{indent}| > Num chars: {self.num_chars}")
@staticmethod
def init_from_config(config: "Coqpit"): # pylint: disable=unused-argument
"""Init your character class from a config.
Implement this method for your subclass.
"""
# use character set from config
if config.characters is not None:
return BaseCharacters(**config.characters), config
# return default character set
characters = BaseCharacters()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
def to_config(self) -> "CharactersConfig":
return CharactersConfig(
characters=self._characters,
punctuations=self._punctuations,
pad=self._pad,
eos=self._eos,
bos=self._bos,
blank=self._blank,
is_unique=self.is_unique,
is_sorted=self.is_sorted,
)
class IPAPhonemes(BaseCharacters):
"""🐸IPAPhonemes class to manage `TTS.tts` model vocabulary
Intended to be used with models using IPAPhonemes as input.
It uses system defaults for the undefined class arguments.
Args:
characters (str):
Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_phonemes`.
punctuations (str):
Characters to be treated as punctuation. Defaults to `_punctuations`.
pad (str):
Special padding character that would be ignored by the model. Defaults to `_pad`.
eos (str):
End of the sentence character. Defaults to `_eos`.
bos (str):
Beginning of the sentence character. Defaults to `_bos`.
blank (str):
Optional character used between characters by some models for better prosody. Defaults to `_blank`.
is_unique (bool):
Remove duplicates from the provided characters. Defaults to False.
is_sorted (bool):
Sort the characters in alphabetical order. Defaults to True.
"""
def __init__(
self,
characters: str = _phonemes,
punctuations: str = _punctuations,
pad: str = _pad,
eos: str = _eos,
bos: str = _bos,
blank: str = _blank,
is_unique: bool = False,
is_sorted: bool = True,
) -> None:
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
@staticmethod
def init_from_config(config: "Coqpit"):
"""Init a IPAPhonemes object from a model config
If characters are not defined in the config, it will be set to the default characters and the config
will be updated.
"""
# band-aid for compatibility with old models
if "characters" in config and config.characters is not None:
if "phonemes" in config.characters and config.characters.phonemes is not None:
config.characters["characters"] = config.characters["phonemes"]
return (
IPAPhonemes(
characters=config.characters["characters"],
punctuations=config.characters["punctuations"],
pad=config.characters["pad"],
eos=config.characters["eos"],
bos=config.characters["bos"],
blank=config.characters["blank"],
is_unique=config.characters["is_unique"],
is_sorted=config.characters["is_sorted"],
),
config,
)
# use character set from config
if config.characters is not None:
return IPAPhonemes(**config.characters), config
# return default character set
characters = IPAPhonemes()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
class Graphemes(BaseCharacters):
"""🐸Graphemes class to manage `TTS.tts` model vocabulary
Intended to be used with models using graphemes as input.
It uses system defaults for the undefined class arguments.
Args:
characters (str):
Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_characters`.
punctuations (str):
Characters to be treated as punctuation. Defaults to `_punctuations`.
pad (str):
Special padding character that would be ignored by the model. Defaults to `_pad`.
eos (str):
End of the sentence character. Defaults to `_eos`.
bos (str):
Beginning of the sentence character. Defaults to `_bos`.
is_unique (bool):
Remove duplicates from the provided characters. Defaults to False.
is_sorted (bool):
Sort the characters in alphabetical order. Defaults to True.
"""
def __init__(
self,
characters: str = _characters,
punctuations: str = _punctuations,
pad: str = _pad,
eos: str = _eos,
bos: str = _bos,
blank: str = _blank,
is_unique: bool = False,
is_sorted: bool = True,
) -> None:
super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
@staticmethod
def init_from_config(config: "Coqpit"):
"""Init a Graphemes object from a model config
If characters are not defined in the config, it will be set to the default characters and the config
will be updated.
"""
if config.characters is not None:
# band-aid for compatibility with old models
if "phonemes" in config.characters:
return (
Graphemes(
characters=config.characters["characters"],
punctuations=config.characters["punctuations"],
pad=config.characters["pad"],
eos=config.characters["eos"],
bos=config.characters["bos"],
blank=config.characters["blank"],
is_unique=config.characters["is_unique"],
is_sorted=config.characters["is_sorted"],
),
config,
)
return Graphemes(**config.characters), config
characters = Graphemes()
new_config = replace(config, characters=characters.to_config())
return characters, new_config
if __name__ == "__main__":
gr = Graphemes()
ph = IPAPhonemes()
gr.print_log()
ph.print_log()
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.
import itertools
import re
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers ( -> 九)
Args:
num (str): arabic number to convert
big (bool, optional): use financial characters. Defaults to False.
simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True.
o (bool, optional): use for 'zero'. Defaults to False.
twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
Raises:
ValueError: if number is more than 1e48
ValueError: if 'e' exposent in number
Returns:
str: converted number as hanzi characters
"""
# check num first
nd = str(num)
if abs(float(nd)) >= 1e48:
raise ValueError("number out of range")
if "e" in nd:
raise ValueError("scientific notation is not supported")
c_symbol = "正负点" if simp else "正負點"
if o: # formal
twoalt = False
if big:
c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
c_unit1 = "拾佰仟"
c_twoalt = "" if simp else ""
else:
c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
c_unit1 = "十百千"
if twoalt:
c_twoalt = "" if simp else ""
else:
c_twoalt = ""
c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l)))
nd = str(num)
result = []
if nd[0] == "+":
result.append(c_symbol[0])
elif nd[0] == "-":
result.append(c_symbol[1])
if "." in nd:
integer, remainder = nd.lstrip("+-").split(".")
else:
integer, remainder = nd.lstrip("+-"), None
if int(integer):
splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
intresult = []
for nu, unit in enumerate(splitted):
# special cases
if int(unit) == 0: # 0000
intresult.append(c_basic[0])
continue
if nu > 0 and int(unit) == 2: # 0002
intresult.append(c_twoalt + c_unit2[nu - 1])
continue
ulist = []
unit = unit.zfill(4)
for nc, ch in enumerate(reversed(unit)):
if ch == "0":
if ulist: # ???0
ulist.append(c_basic[0])
elif nc == 0:
ulist.append(c_basic[int(ch)])
elif nc == 1 and ch == "1" and unit[1] == "0":
# special case for tens
# edit the 'elif' if you don't like
# 十四, 三千零十四, 三千三百一十四
ulist.append(c_unit1[0])
elif nc > 1 and ch == "2":
ulist.append(c_twoalt + c_unit1[nc - 1])
else:
ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
ustr = revuniq(ulist)
if nu == 0:
intresult.append(ustr)
else:
intresult.append(ustr + c_unit2[nu - 1])
result.append(revuniq(intresult).strip(c_basic[0]))
else:
result.append(c_basic[0])
if remainder:
result.append(c_symbol[2])
result.append("".join(c_basic[int(ch)] for ch in remainder))
return "".join(result)
def _number_replace(match) -> str:
"""function to apply in a match, transform all numbers in a match by chinese characters
Args:
match (re.Match): numbers regex matches
Returns:
str: replaced characters for the numbers
"""
match_str: str = match.group()
return _num2chinese(match_str)
def replace_numbers_to_characters_in_text(text: str) -> str:
"""Replace all arabic numbers in a text by their equivalent in chinese characters (simplified)
Args:
text (str): input text to transform
Returns:
str: output text
"""
text = re.sub(r"[0-9]+", _number_replace, text)
return text
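if __name__ == "__main__":
    # Minimal demo (illustrative): Arabic digits are rewritten as simplified hanzi.
    print(_num2chinese("123"))  # expected: 一百二十三
    print(replace_numbers_to_characters_in_text("我有2个苹果"))  # expected: 我有二个苹果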
@@ -0,0 +1,37 @@
from typing import List
import jieba
import pypinyin
from .pinyinToPhonemes import PINYIN_DICT
def _chinese_character_to_pinyin(text: str) -> List[str]:
pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
pinyins_flat_list = [item for sublist in pinyins for item in sublist]
return pinyins_flat_list
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
segment = pinyin[:-1]
tone = pinyin[-1]
phoneme = PINYIN_DICT.get(segment, [""])[0]
return phoneme + tone
def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
tokenized_text = jieba.cut(text, HMM=False)
tokenized_text = " ".join(tokenized_text)
pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
results: List[str] = []
for token in pinyined_text:
if token[-1] in "12345": # TODO transform to is_pinyin()
pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
results += list(pinyin_phonemes)
else:  # is punctuation or other
results += list(token)
return seperator.join(results)
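if __name__ == "__main__":
    # Minimal demo (illustrative): requires the jieba and pypinyin packages imported above.
    # For "你好" this yields a tone-numbered phoneme string along the lines of "n|i|3|x|a|ʌ|3".
    print(chinese_text_to_phonemes("你好"))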
@@ -0,0 +1,419 @@
PINYIN_DICT = {
"a": ["a"],
"ai": ["ai"],
"an": ["an"],
"ang": ["ɑŋ"],
"ao": [""],
"ba": ["ba"],
"bai": ["bai"],
"ban": ["ban"],
"bang": ["bɑŋ"],
"bao": ["baʌ"],
# "be": ["be"], doesnt exist
"bei": ["bɛi"],
"ben": ["bœn"],
"beng": ["bɵŋ"],
"bi": ["bi"],
"bian": ["biɛn"],
"biao": ["biaʌ"],
"bie": ["bie"],
"bin": ["bin"],
"bing": ["bɨŋ"],
"bo": ["bo"],
"bu": ["bu"],
"ca": ["tsa"],
"cai": ["tsai"],
"can": ["tsan"],
"cang": ["tsɑŋ"],
"cao": ["tsaʌ"],
"ce": ["tsø"],
"cen": ["tsœn"],
"ceng": ["tsɵŋ"],
"cha": ["ʈʂa"],
"chai": ["ʈʂai"],
"chan": ["ʈʂan"],
"chang": ["ʈʂɑŋ"],
"chao": ["ʈʂaʌ"],
"che": ["ʈʂø"],
"chen": ["ʈʂœn"],
"cheng": ["ʈʂɵŋ"],
"chi": ["ʈʂʏ"],
"chong": ["ʈʂoŋ"],
"chou": ["ʈʂou"],
"chu": ["ʈʂu"],
"chua": ["ʈʂua"],
"chuai": ["ʈʂuai"],
"chuan": ["ʈʂuan"],
"chuang": ["ʈʂuɑŋ"],
"chui": ["ʈʂuei"],
"chun": ["ʈʂun"],
"chuo": ["ʈʂuo"],
"ci": ["tsɪ"],
"cong": ["tsoŋ"],
"cou": ["tsou"],
"cu": ["tsu"],
"cuan": ["tsuan"],
"cui": ["tsuei"],
"cun": ["tsun"],
"cuo": ["tsuo"],
"da": ["da"],
"dai": ["dai"],
"dan": ["dan"],
"dang": ["dɑŋ"],
"dao": ["daʌ"],
"de": [""],
"dei": ["dei"],
# "den": ["dœn"],
"deng": ["dɵŋ"],
"di": ["di"],
"dia": ["dia"],
"dian": ["diɛn"],
"diao": ["diaʌ"],
"die": ["die"],
"ding": ["dɨŋ"],
"diu": ["dio"],
"dong": ["doŋ"],
"dou": ["dou"],
"du": ["du"],
"duan": ["duan"],
"dui": ["duei"],
"dun": ["dun"],
"duo": ["duo"],
"e": ["ø"],
"ei": ["ei"],
"en": ["œn"],
# "ng": ["œn"],
# "eng": ["ɵŋ"],
"er": ["er"],
"fa": ["fa"],
"fan": ["fan"],
"fang": ["fɑŋ"],
"fei": ["fei"],
"fen": ["fœn"],
"feng": ["fɵŋ"],
"fo": ["fo"],
"fou": ["fou"],
"fu": ["fu"],
"ga": ["ga"],
"gai": ["gai"],
"gan": ["gan"],
"gang": ["gɑŋ"],
"gao": ["gaʌ"],
"ge": [""],
"gei": ["gei"],
"gen": ["gœn"],
"geng": ["gɵŋ"],
"gong": ["goŋ"],
"gou": ["gou"],
"gu": ["gu"],
"gua": ["gua"],
"guai": ["guai"],
"guan": ["guan"],
"guang": ["guɑŋ"],
"gui": ["guei"],
"gun": ["gun"],
"guo": ["guo"],
"ha": ["xa"],
"hai": ["xai"],
"han": ["xan"],
"hang": ["xɑŋ"],
"hao": ["xaʌ"],
"he": [""],
"hei": ["xei"],
"hen": ["xœn"],
"heng": ["xɵŋ"],
"hong": ["xoŋ"],
"hou": ["xou"],
"hu": ["xu"],
"hua": ["xua"],
"huai": ["xuai"],
"huan": ["xuan"],
"huang": ["xuɑŋ"],
"hui": ["xuei"],
"hun": ["xun"],
"huo": ["xuo"],
"ji": ["dʑi"],
"jia": ["dʑia"],
"jian": ["dʑiɛn"],
"jiang": ["dʑiɑŋ"],
"jiao": ["dʑiaʌ"],
"jie": ["dʑie"],
"jin": ["dʑin"],
"jing": ["dʑɨŋ"],
"jiong": ["dʑioŋ"],
"jiu": ["dʑio"],
"ju": ["dʑy"],
"juan": ["dʑyɛn"],
"jue": ["dʑye"],
"jun": ["dʑyn"],
"ka": ["ka"],
"kai": ["kai"],
"kan": ["kan"],
"kang": ["kɑŋ"],
"kao": ["kaʌ"],
"ke": [""],
"kei": ["kei"],
"ken": ["kœn"],
"keng": ["kɵŋ"],
"kong": ["koŋ"],
"kou": ["kou"],
"ku": ["ku"],
"kua": ["kua"],
"kuai": ["kuai"],
"kuan": ["kuan"],
"kuang": ["kuɑŋ"],
"kui": ["kuei"],
"kun": ["kun"],
"kuo": ["kuo"],
"la": ["la"],
"lai": ["lai"],
"lan": ["lan"],
"lang": ["lɑŋ"],
"lao": ["laʌ"],
"le": [""],
"lei": ["lei"],
"leng": ["lɵŋ"],
"li": ["li"],
"lia": ["lia"],
"lian": ["liɛn"],
"liang": ["liɑŋ"],
"liao": ["liaʌ"],
"lie": ["lie"],
"lin": ["lin"],
"ling": ["lɨŋ"],
"liu": ["lio"],
"lo": ["lo"],
"long": ["loŋ"],
"lou": ["lou"],
"lu": ["lu"],
"lv": ["ly"],
"luan": ["luan"],
"lve": ["lye"],
"lue": ["lue"],
"lun": ["lun"],
"luo": ["luo"],
"ma": ["ma"],
"mai": ["mai"],
"man": ["man"],
"mang": ["mɑŋ"],
"mao": ["maʌ"],
"me": [""],
"mei": ["mei"],
"men": ["mœn"],
"meng": ["mɵŋ"],
"mi": ["mi"],
"mian": ["miɛn"],
"miao": ["miaʌ"],
"mie": ["mie"],
"min": ["min"],
"ming": ["mɨŋ"],
"miu": ["mio"],
"mo": ["mo"],
"mou": ["mou"],
"mu": ["mu"],
"na": ["na"],
"nai": ["nai"],
"nan": ["nan"],
"nang": ["nɑŋ"],
"nao": ["naʌ"],
"ne": [""],
"nei": ["nei"],
"nen": ["nœn"],
"neng": ["nɵŋ"],
"ni": ["ni"],
"nia": ["nia"],
"nian": ["niɛn"],
"niang": ["niɑŋ"],
"niao": ["niaʌ"],
"nie": ["nie"],
"nin": ["nin"],
"ning": ["nɨŋ"],
"niu": ["nio"],
"nong": ["noŋ"],
"nou": ["nou"],
"nu": ["nu"],
"nv": ["ny"],
"nuan": ["nuan"],
"nve": ["nye"],
"nue": ["nye"],
"nuo": ["nuo"],
"o": ["o"],
"ou": ["ou"],
"pa": ["pa"],
"pai": ["pai"],
"pan": ["pan"],
"pang": ["pɑŋ"],
"pao": ["paʌ"],
"pe": [""],
"pei": ["pei"],
"pen": ["pœn"],
"peng": ["pɵŋ"],
"pi": ["pi"],
"pian": ["piɛn"],
"piao": ["piaʌ"],
"pie": ["pie"],
"pin": ["pin"],
"ping": ["pɨŋ"],
"po": ["po"],
"pou": ["pou"],
"pu": ["pu"],
"qi": ["tɕi"],
"qia": ["tɕia"],
"qian": ["tɕiɛn"],
"qiang": ["tɕiɑŋ"],
"qiao": ["tɕiaʌ"],
"qie": ["tɕie"],
"qin": ["tɕin"],
"qing": ["tɕɨŋ"],
"qiong": ["tɕioŋ"],
"qiu": ["tɕio"],
"qu": ["tɕy"],
"quan": ["tɕyɛn"],
"que": ["tɕye"],
"qun": ["tɕyn"],
"ran": ["ʐan"],
"rang": ["ʐɑŋ"],
"rao": ["ʐaʌ"],
"re": ["ʐø"],
"ren": ["ʐœn"],
"reng": ["ʐɵŋ"],
"ri": ["ʐʏ"],
"rong": ["ʐoŋ"],
"rou": ["ʐou"],
"ru": ["ʐu"],
"rua": ["ʐua"],
"ruan": ["ʐuan"],
"rui": ["ʐuei"],
"run": ["ʐun"],
"ruo": ["ʐuo"],
"sa": ["sa"],
"sai": ["sai"],
"san": ["san"],
"sang": ["sɑŋ"],
"sao": ["saʌ"],
"se": [""],
"sen": ["sœn"],
"seng": ["sɵŋ"],
"sha": ["ʂa"],
"shai": ["ʂai"],
"shan": ["ʂan"],
"shang": ["ʂɑŋ"],
"shao": ["ʂaʌ"],
"she": ["ʂø"],
"shei": ["ʂei"],
"shen": ["ʂœn"],
"sheng": ["ʂɵŋ"],
"shi": ["ʂʏ"],
"shou": ["ʂou"],
"shu": ["ʂu"],
"shua": ["ʂua"],
"shuai": ["ʂuai"],
"shuan": ["ʂuan"],
"shuang": ["ʂuɑŋ"],
"shui": ["ʂuei"],
"shun": ["ʂun"],
"shuo": ["ʂuo"],
"si": ["sɪ"],
"song": ["soŋ"],
"sou": ["sou"],
"su": ["su"],
"suan": ["suan"],
"sui": ["suei"],
"sun": ["sun"],
"suo": ["suo"],
"ta": ["ta"],
"tai": ["tai"],
"tan": ["tan"],
"tang": ["tɑŋ"],
"tao": ["taʌ"],
"te": [""],
"tei": ["tei"],
"teng": ["tɵŋ"],
"ti": ["ti"],
"tian": ["tiɛn"],
"tiao": ["tiaʌ"],
"tie": ["tie"],
"ting": ["tɨŋ"],
"tong": ["toŋ"],
"tou": ["tou"],
"tu": ["tu"],
"tuan": ["tuan"],
"tui": ["tuei"],
"tun": ["tun"],
"tuo": ["tuo"],
"wa": ["wa"],
"wai": ["wai"],
"wan": ["wan"],
"wang": ["wɑŋ"],
"wei": ["wei"],
"wen": ["wœn"],
"weng": ["wɵŋ"],
"wo": ["wo"],
"wu": ["wu"],
"xi": ["ɕi"],
"xia": ["ɕia"],
"xian": ["ɕiɛn"],
"xiang": ["ɕiɑŋ"],
"xiao": ["ɕiaʌ"],
"xie": ["ɕie"],
"xin": ["ɕin"],
"xing": ["ɕɨŋ"],
"xiong": ["ɕioŋ"],
"xiu": ["ɕio"],
"xu": ["ɕy"],
"xuan": ["ɕyɛn"],
"xue": ["ɕye"],
"xun": ["ɕyn"],
"ya": ["ia"],
"yan": ["iɛn"],
"yang": ["iɑŋ"],
"yao": ["iaʌ"],
"ye": ["ie"],
"yi": ["i"],
"yin": ["in"],
"ying": ["ɨŋ"],
"yo": ["io"],
"yong": ["ioŋ"],
"you": ["io"],
"yu": ["y"],
"yuan": ["yɛn"],
"yue": ["ye"],
"yun": ["yn"],
"za": ["dza"],
"zai": ["dzai"],
"zan": ["dzan"],
"zang": ["dzɑŋ"],
"zao": ["dzaʌ"],
"ze": ["dzø"],
"zei": ["dzei"],
"zen": ["dzœn"],
"zeng": ["dzɵŋ"],
"zha": ["dʒa"],
"zhai": ["dʒai"],
"zhan": ["dʒan"],
"zhang": ["dʒɑŋ"],
"zhao": ["dʒaʌ"],
"zhe": ["dʒø"],
# "zhei": ["dʒei"], it doesn't exist
"zhen": ["dʒœn"],
"zheng": ["dʒɵŋ"],
"zhi": ["dʒʏ"],
"zhong": ["dʒoŋ"],
"zhou": ["dʒou"],
"zhu": ["dʒu"],
"zhua": ["dʒua"],
"zhuai": ["dʒuai"],
"zhuan": ["dʒuan"],
"zhuang": ["dʒuɑŋ"],
"zhui": ["dʒuei"],
"zhun": ["dʒun"],
"zhuo": ["dʒuo"],
"zi": ["dzɪ"],
"zong": ["dzoŋ"],
"zou": ["dzou"],
"zu": ["dzu"],
"zuan": ["dzuan"],
"zui": ["dzuei"],
"zun": ["dzun"],
"zuo": ["dzuo"],
}
@@ -0,0 +1,171 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re
from anyascii import anyascii
from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
from .english.abbreviations import abbreviations_en
from .english.number_norm import normalize_numbers as en_normalize_numbers
from .english.time_norm import expand_time_english
from .french.abbreviations import abbreviations_fr
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
def expand_abbreviations(text, lang="en"):
if lang == "en":
_abbreviations = abbreviations_en
elif lang == "fr":
_abbreviations = abbreviations_fr
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, " ", text).strip()
def convert_to_ascii(text):
return anyascii(text)
def remove_aux_symbols(text):
text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
return text
def replace_symbols(text, lang="en"):
"""Replace symbols based on the lenguage tag.
Args:
text:
Input text.
lang:
Language identifier. ex: "en", "fr", "pt", "ca".
Returns:
The modified text
example:
input args:
text: "si l'avi cau, diguem-ho"
lang: "ca"
Output:
text: "si lavi cau, diguemho"
"""
text = text.replace(";", ",")
text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
text = text.replace(":", ",")
if lang == "en":
text = text.replace("&", " and ")
elif lang == "fr":
text = text.replace("&", " et ")
elif lang == "pt":
text = text.replace("&", " e ")
elif lang == "ca":
text = text.replace("&", " i ")
text = text.replace("'", "")
return text
def basic_cleaners(text):
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
"""Pipeline for non-English text that transliterates to ASCII."""
# text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
def basic_german_cleaners(text):
"""Pipeline for German text"""
text = lowercase(text)
text = collapse_whitespace(text)
return text
# TODO: elaborate it
def basic_turkish_cleaners(text):
"""Pipeline for Turkish text"""
text = text.replace("I", "ı")
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text):
"""Pipeline for English text, including number and abbreviation expansion."""
# text = convert_to_ascii(text)
text = lowercase(text)
text = expand_time_english(text)
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def phoneme_cleaners(text):
"""Pipeline for phonemes mode, including number and abbreviation expansion."""
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def french_cleaners(text):
"""Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
text = expand_abbreviations(text, lang="fr")
text = lowercase(text)
text = replace_symbols(text, lang="fr")
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def portuguese_cleaners(text):
"""Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that"""
text = lowercase(text)
text = replace_symbols(text, lang="pt")
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def chinese_mandarin_cleaners(text: str) -> str:
"""Basic pipeline for chinese"""
text = replace_numbers_to_characters_in_text(text)
return text
def multilingual_cleaners(text):
"""Pipeline for multilingual text"""
text = lowercase(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
return text
def no_cleaners(text):
# remove newline characters
text = text.replace("\n", "")
return text
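if __name__ == "__main__":
    # Minimal demo (illustrative) of the English pipeline: times, numbers and abbreviations
    # are expanded before whitespace is collapsed.
    # Expected output: "doctor smith bought two apples at ten thirty a m."
    print(english_cleaners("Dr. Smith bought 2 apples at 10:30 am."))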
@@ -0,0 +1,151 @@
# -*- coding: utf-8 -*-
import re
VALID_SYMBOLS = [
"AA",
"AA0",
"AA1",
"AA2",
"AE",
"AE0",
"AE1",
"AE2",
"AH",
"AH0",
"AH1",
"AH2",
"AO",
"AO0",
"AO1",
"AO2",
"AW",
"AW0",
"AW1",
"AW2",
"AY",
"AY0",
"AY1",
"AY2",
"B",
"CH",
"D",
"DH",
"EH",
"EH0",
"EH1",
"EH2",
"ER",
"ER0",
"ER1",
"ER2",
"EY",
"EY0",
"EY1",
"EY2",
"F",
"G",
"HH",
"IH",
"IH0",
"IH1",
"IH2",
"IY",
"IY0",
"IY1",
"IY2",
"JH",
"K",
"L",
"M",
"N",
"NG",
"OW",
"OW0",
"OW1",
"OW2",
"OY",
"OY0",
"OY1",
"OY2",
"P",
"R",
"S",
"SH",
"T",
"TH",
"UH",
"UH0",
"UH1",
"UH2",
"UW",
"UW0",
"UW1",
"UW2",
"V",
"W",
"Y",
"Z",
"ZH",
]
class CMUDict:
"""Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding="latin-1") as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
self._entries = entries
def __len__(self):
return len(self._entries)
def lookup(self, word):
"""Returns list of ARPAbet pronunciations of the given word."""
return self._entries.get(word.upper())
@staticmethod
def get_arpabet(word, cmudict, punctuation_symbols):
first_symbol, last_symbol = "", ""
if word and word[0] in punctuation_symbols:
first_symbol = word[0]
word = word[1:]
if word and word[-1] in punctuation_symbols:
last_symbol = word[-1]
word = word[:-1]
arpabet = cmudict.lookup(word)
if arpabet is not None:
return first_symbol + "{%s}" % arpabet[0] + last_symbol
return first_symbol + word + last_symbol
_alt_re = re.compile(r"\([0-9]+\)")
def _parse_cmudict(file):
cmudict = {}
for line in file:
if line and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
parts = line.split("  ")  # word and pronunciation are separated by two spaces in CMUDict
word = re.sub(_alt_re, "", parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict
def _get_pronunciation(s):
parts = s.strip().split(" ")
for part in parts:
if part not in VALID_SYMBOLS:
return None
return " ".join(parts)
@@ -0,0 +1,26 @@
import re
# List of (regular expression, replacement) pairs for abbreviations in english:
abbreviations_en = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
@@ -0,0 +1,97 @@
""" from https://github.com/keithito/tacotron """
import re
from typing import Dict
import inflect
_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"-?[0-9]+")
def _remove_commas(m):
return m.group(1).replace(",", "")
def _expand_decimal_point(m):
return m.group(1).replace(".", " point ")
def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
parts = value.replace(",", "").split(".")
if len(parts) > 2:
return f"{value} {inflection[2]}" # Unexpected format
text = []
integer = int(parts[0]) if parts[0] else 0
if integer > 0:
integer_unit = inflection.get(integer, inflection[2])
text.append(f"{integer} {integer_unit}")
fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if fraction > 0:
fraction_unit = inflection.get(fraction / 100, inflection[0.02])
text.append(f"{fraction} {fraction_unit}")
if len(text) == 0:
return f"zero {inflection[2]}"
return " ".join(text)
def _expand_currency(m: "re.Match") -> str:
currencies = {
"$": {
0.01: "cent",
0.02: "cents",
1: "dollar",
2: "dollars",
},
"": {
0.01: "cent",
0.02: "cents",
1: "euro",
2: "euros",
},
"£": {
0.01: "penny",
0.02: "pence",
1: "pound sterling",
2: "pounds sterling",
},
"¥": {
# TODO rin
0.02: "sen",
2: "yen",
},
}
unit = m.group(1)
currency = currencies[unit]
value = m.group(2)
return __expand_currency(value, currency)
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if 1000 < num < 3000:
if num == 2000:
return "two thousand"
if 2000 < num < 2010:
return "two thousand " + _inflect.number_to_words(num % 100)
if num % 100 == 0:
return _inflect.number_to_words(num // 100) + " hundred"
return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
return _inflect.number_to_words(num, andword="")
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_currency_re, _expand_currency, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
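if __name__ == "__main__":
    # Minimal demo (illustrative): currency, decimal and plain numbers are verbalized in turn.
    # Expected output: "I paid three dollars fifty cents for two thousand apples"
    print(normalize_numbers("I paid $3.50 for 2000 apples"))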
@@ -0,0 +1,47 @@
import re
import inflect
_inflect = inflect.engine()
_time_re = re.compile(
r"""\b
((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
:
([0-5][0-9]) # minutes
\s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
\b""",
re.IGNORECASE | re.X,
)
def _expand_num(n: int) -> str:
return _inflect.number_to_words(n)
def _expand_time_english(match: "re.Match") -> str:
hour = int(match.group(1))
past_noon = hour >= 12
time = []
if hour > 12:
hour -= 12
elif hour == 0:
hour = 12
past_noon = True
time.append(_expand_num(hour))
minute = int(match.group(6))
if minute > 0:
if minute < 10:
time.append("oh")
time.append(_expand_num(minute))
am_pm = match.group(7)
if am_pm is None:
time.append("p m" if past_noon else "a m")
else:
time.extend(list(am_pm.replace(".", "")))
return " ".join(time)
def expand_time_english(text: str) -> str:
return re.sub(_time_re, _expand_time_english, text)
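if __name__ == "__main__":
    # Minimal demo (illustrative).
    # Expected output: "The train leaves at three thirty p m"
    print(expand_time_english("The train leaves at 3:30 pm"))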
@@ -0,0 +1,48 @@
import re
# List of (regular expression, replacement) pairs for abbreviations in french:
abbreviations_fr = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("M", "monsieur"),
("Mlle", "mademoiselle"),
("Mlles", "mesdemoiselles"),
("Mme", "Madame"),
("Mmes", "Mesdames"),
("N.B", "nota bene"),
("M", "monsieur"),
("p.c.q", "parce que"),
("Pr", "professeur"),
("qqch", "quelque chose"),
("rdv", "rendez-vous"),
("max", "maximum"),
("min", "minimum"),
("no", "numéro"),
("adr", "adresse"),
("dr", "docteur"),
("st", "saint"),
("co", "companie"),
("jr", "junior"),
("sgt", "sergent"),
("capt", "capitain"),
("col", "colonel"),
("av", "avenue"),
("av. J.-C", "avant Jésus-Christ"),
("apr. J.-C", "après Jésus-Christ"),
("art", "article"),
("boul", "boulevard"),
("c.-à-d", "cest-à-dire"),
("etc", "et cetera"),
("ex", "exemple"),
("excl", "exclusivement"),
("boul", "boulevard"),
]
] + [
(re.compile("\\b%s" % x[0]), x[1])
for x in [
("Mlle", "mademoiselle"),
("Mlles", "mesdemoiselles"),
("Mme", "Madame"),
("Mmes", "Mesdames"),
]
]
@@ -0,0 +1,470 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
try:
import MeCab
except ImportError as e:
raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
from num2words import num2words
_CONVRULES = [
# Conversion of 2 letters
"アァ/ a a",
"イィ/ i i",
"イェ/ i e",
"イャ/ y a",
"ウゥ/ u:",
"エェ/ e e",
"オォ/ o:",
"カァ/ k a:",
"キィ/ k i:",
"クゥ/ k u:",
"クャ/ ky a",
"クュ/ ky u",
"クョ/ ky o",
"ケェ/ k e:",
"コォ/ k o:",
"ガァ/ g a:",
"ギィ/ g i:",
"グゥ/ g u:",
"グャ/ gy a",
"グュ/ gy u",
"グョ/ gy o",
"ゲェ/ g e:",
"ゴォ/ g o:",
"サァ/ s a:",
"シィ/ sh i:",
"スゥ/ s u:",
"スャ/ sh a",
"スュ/ sh u",
"スョ/ sh o",
"セェ/ s e:",
"ソォ/ s o:",
"ザァ/ z a:",
"ジィ/ j i:",
"ズゥ/ z u:",
"ズャ/ zy a",
"ズュ/ zy u",
"ズョ/ zy o",
"ゼェ/ z e:",
"ゾォ/ z o:",
"タァ/ t a:",
"チィ/ ch i:",
"ツァ/ ts a",
"ツィ/ ts i",
"ツゥ/ ts u:",
"ツャ/ ch a",
"ツュ/ ch u",
"ツョ/ ch o",
"ツェ/ ts e",
"ツォ/ ts o",
"テェ/ t e:",
"トォ/ t o:",
"ダァ/ d a:",
"ヂィ/ j i:",
"ヅゥ/ d u:",
"ヅャ/ zy a",
"ヅュ/ zy u",
"ヅョ/ zy o",
"デェ/ d e:",
"ドォ/ d o:",
"ナァ/ n a:",
"ニィ/ n i:",
"ヌゥ/ n u:",
"ヌャ/ ny a",
"ヌュ/ ny u",
"ヌョ/ ny o",
"ネェ/ n e:",
"ノォ/ n o:",
"ハァ/ h a:",
"ヒィ/ h i:",
"フゥ/ f u:",
"フャ/ hy a",
"フュ/ hy u",
"フョ/ hy o",
"ヘェ/ h e:",
"ホォ/ h o:",
"バァ/ b a:",
"ビィ/ b i:",
"ブゥ/ b u:",
"フャ/ hy a",
"ブュ/ by u",
"フョ/ hy o",
"ベェ/ b e:",
"ボォ/ b o:",
"パァ/ p a:",
"ピィ/ p i:",
"プゥ/ p u:",
"プャ/ py a",
"プュ/ py u",
"プョ/ py o",
"ペェ/ p e:",
"ポォ/ p o:",
"マァ/ m a:",
"ミィ/ m i:",
"ムゥ/ m u:",
"ムャ/ my a",
"ムュ/ my u",
"ムョ/ my o",
"メェ/ m e:",
"モォ/ m o:",
"ヤァ/ y a:",
"ユゥ/ y u:",
"ユャ/ y a:",
"ユュ/ y u:",
"ユョ/ y o:",
"ヨォ/ y o:",
"ラァ/ r a:",
"リィ/ r i:",
"ルゥ/ r u:",
"ルャ/ ry a",
"ルュ/ ry u",
"ルョ/ ry o",
"レェ/ r e:",
"ロォ/ r o:",
"ワァ/ w a:",
"ヲォ/ o:",
"ディ/ d i",
"デェ/ d e:",
"デャ/ dy a",
"デュ/ dy u",
"デョ/ dy o",
"ティ/ t i",
"テェ/ t e:",
"テャ/ ty a",
"テュ/ ty u",
"テョ/ ty o",
"スィ/ s i",
"ズァ/ z u a",
"ズィ/ z i",
"ズゥ/ z u",
"ズャ/ zy a",
"ズュ/ zy u",
"ズョ/ zy o",
"ズェ/ z e",
"ズォ/ z o",
"キャ/ ky a",
"キュ/ ky u",
"キョ/ ky o",
"シャ/ sh a",
"シュ/ sh u",
"シェ/ sh e",
"ショ/ sh o",
"チャ/ ch a",
"チュ/ ch u",
"チェ/ ch e",
"チョ/ ch o",
"トゥ/ t u",
"トャ/ ty a",
"トュ/ ty u",
"トョ/ ty o",
"ドァ/ d o a",
"ドゥ/ d u",
"ドャ/ dy a",
"ドュ/ dy u",
"ドョ/ dy o",
"ドォ/ d o:",
"ニャ/ ny a",
"ニュ/ ny u",
"ニョ/ ny o",
"ヒャ/ hy a",
"ヒュ/ hy u",
"ヒョ/ hy o",
"ミャ/ my a",
"ミュ/ my u",
"ミョ/ my o",
"リャ/ ry a",
"リュ/ ry u",
"リョ/ ry o",
"ギャ/ gy a",
"ギュ/ gy u",
"ギョ/ gy o",
"ヂェ/ j e",
"ヂャ/ j a",
"ヂュ/ j u",
"ヂョ/ j o",
"ジェ/ j e",
"ジャ/ j a",
"ジュ/ j u",
"ジョ/ j o",
"ビャ/ by a",
"ビュ/ by u",
"ビョ/ by o",
"ピャ/ py a",
"ピュ/ py u",
"ピョ/ py o",
"ウァ/ u a",
"ウィ/ w i",
"ウェ/ w e",
"ウォ/ w o",
"ファ/ f a",
"フィ/ f i",
"フゥ/ f u",
"フャ/ hy a",
"フュ/ hy u",
"フョ/ hy o",
"フェ/ f e",
"フォ/ f o",
"ヴァ/ b a",
"ヴィ/ b i",
"ヴェ/ b e",
"ヴォ/ b o",
"ヴュ/ by u",
# Conversion of 1 letter
"ア/ a",
"イ/ i",
"ウ/ u",
"エ/ e",
"オ/ o",
"カ/ k a",
"キ/ k i",
"ク/ k u",
"ケ/ k e",
"コ/ k o",
"サ/ s a",
"シ/ sh i",
"ス/ s u",
"セ/ s e",
"ソ/ s o",
"タ/ t a",
"チ/ ch i",
"ツ/ ts u",
"テ/ t e",
"ト/ t o",
"ナ/ n a",
"ニ/ n i",
"ヌ/ n u",
"ネ/ n e",
"/ n o",
"ハ/ h a",
"ヒ/ h i",
"フ/ f u",
"ヘ/ h e",
"ホ/ h o",
"マ/ m a",
"ミ/ m i",
"ム/ m u",
"メ/ m e",
"モ/ m o",
"ラ/ r a",
"リ/ r i",
"ル/ r u",
"レ/ r e",
"ロ/ r o",
"ガ/ g a",
"ギ/ g i",
"グ/ g u",
"ゲ/ g e",
"ゴ/ g o",
"ザ/ z a",
"ジ/ j i",
"ズ/ z u",
"ゼ/ z e",
"ゾ/ z o",
"ダ/ d a",
"ヂ/ j i",
"ヅ/ z u",
"デ/ d e",
"ド/ d o",
"バ/ b a",
"ビ/ b i",
"ブ/ b u",
"ベ/ b e",
"ボ/ b o",
"パ/ p a",
"ピ/ p i",
"プ/ p u",
"ペ/ p e",
"ポ/ p o",
"ヤ/ y a",
"ユ/ y u",
"ヨ/ y o",
"ワ/ w a",
"ヰ/ i",
"ヱ/ e",
"ヲ/ o",
"ン/ N",
"ッ/ q",
"ヴ/ b u",
"ー/:",
# Try converting broken text
"ァ/ a",
"ィ/ i",
"ゥ/ u",
"ェ/ e",
"ォ/ o",
"ヮ/ w a",
"ォ/ o",
# Symbols
"、/ ,",
"。/ .",
"/ !",
"/ ?",
"・/ ,",
]
_COLON_RX = re.compile(":+")
_REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
def _makerulemap():
l = [tuple(x.split("/")) for x in _CONVRULES]
return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
_RULEMAP1, _RULEMAP2 = _makerulemap()
def kata2phoneme(text: str) -> str:
"""Convert katakana text to phonemes."""
text = text.strip()
res = ""
while text:
if len(text) >= 2:
x = _RULEMAP2.get(text[:2])
if x is not None:
text = text[2:]
res += x
continue
x = _RULEMAP1.get(text[0])
if x is not None:
text = text[1:]
res += x
continue
res += " " + text[0]
text = text[1:]
res = _COLON_RX.sub(":", res)
return res[1:]
_KATAKANA = "".join(chr(ch) for ch in range(ord(""), ord("") + 1))
_HIRAGANA = "".join(chr(ch) for ch in range(ord(""), ord("") + 1))
_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
def hira2kata(text: str) -> str:
text = text.translate(_HIRA2KATATRANS)
return text.replace("う゛", "ヴ")
_SYMBOL_TOKENS = set(list("・、。?!"))
_NO_YOMI_TOKENS = set(list("「」『』―()[][] …"))
_TAGGER = MeCab.Tagger()
def text2kata(text: str) -> str:
parsed = _TAGGER.parse(text)
res = []
for line in parsed.split("\n"):
if line == "EOS":
break
parts = line.split("\t")
word, yomi = parts[0], parts[1]
if yomi:
res.append(yomi)
else:
if word in _SYMBOL_TOKENS:
res.append(word)
elif word in ("", ""):
res.append("")
elif word in _NO_YOMI_TOKENS:
pass
else:
res.append(word)
return hira2kata("".join(res))
_ALPHASYMBOL_YOMI = {
"#": "シャープ",
"%": "パーセント",
"&": "アンド",
"+": "プラス",
"-": "マイナス",
":": "コロン",
";": "セミコロン",
"<": "小なり",
"=": "イコール",
">": "大なり",
"@": "アット",
"a": "エー",
"b": "ビー",
"c": "シー",
"d": "ディー",
"e": "イー",
"f": "エフ",
"g": "ジー",
"h": "エイチ",
"i": "アイ",
"j": "ジェー",
"k": "ケー",
"l": "エル",
"m": "エム",
"n": "エヌ",
"o": "オー",
"p": "ピー",
"q": "キュー",
"r": "アール",
"s": "エス",
"t": "ティー",
"u": "ユー",
"v": "ブイ",
"w": "ダブリュー",
"x": "エックス",
"y": "ワイ",
"z": "ゼット",
"α": "アルファ",
"β": "ベータ",
"γ": "ガンマ",
"δ": "デルタ",
"ε": "イプシロン",
"ζ": "ゼータ",
"η": "イータ",
"θ": "シータ",
"ι": "イオタ",
"κ": "カッパ",
"λ": "ラムダ",
"μ": "ミュー",
"ν": "ニュー",
"ξ": "クサイ",
"ο": "オミクロン",
"π": "パイ",
"ρ": "ロー",
"σ": "シグマ",
"τ": "タウ",
"υ": "ウプシロン",
"φ": "ファイ",
"χ": "カイ",
"ψ": "プサイ",
"ω": "オメガ",
}
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
def japanese_convert_numbers_to_words(text: str) -> str:
res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
return res
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
def japanese_text_to_phonemes(text: str) -> str:
"""Convert Japanese text to phonemes."""
res = unicodedata.normalize("NFKC", text)
res = japanese_convert_numbers_to_words(res)
res = japanese_convert_alpha_symbols_to_words(res)
res = text2kata(res)
res = kata2phoneme(res)
return res.replace(" ", "")
@@ -0,0 +1,44 @@
# coding: utf-8
# Add the word you want to the dictionary.
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
english_dictionary = {
"KOREA": "코리아",
"IDOL": "아이돌",
"IT": "아이티",
"IQ": "아이큐",
"UP": "",
"DOWN": "다운",
"PC": "피씨",
"CCTV": "씨씨티비",
"SNS": "에스엔에스",
"AI": "에이아이",
"CEO": "씨이오",
"A": "에이",
"B": "",
"C": "",
"D": "",
"E": "",
"F": "에프",
"G": "",
"H": "에이치",
"I": "아이",
"J": "제이",
"K": "케이",
"L": "",
"M": "",
"N": "",
"O": "",
"P": "",
"Q": "",
"R": "",
"S": "에스",
"T": "",
"U": "",
"V": "브이",
"W": "더블유",
"X": "엑스",
"Y": "와이",
"Z": "제트",
}
@@ -0,0 +1,32 @@
# coding: utf-8
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
import re
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
def normalize(text):
text = text.strip()
text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = text.lower()
return text
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
@@ -0,0 +1,36 @@
from jamo import hangul_to_jamo
from TTS.tts.utils.text.korean.korean import normalize
g2p = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
"""
The input and output values look the same, but they are different in Unicode.
example :
input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
"""
global g2p # pylint: disable=global-statement
if g2p is None:
from g2pkk import G2p
g2p = G2p()
if character == "english":
from anyascii import anyascii
text = normalize(text)
text = g2p(text)
text = anyascii(text)
return text
text = normalize(text)
text = g2p(text)
text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
return "".join(text)
@@ -0,0 +1,79 @@
from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.belarusian_phonemizer import BEL_Phonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
try:
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
except ImportError:
JA_JP_Phonemizer = None
pass
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, KO_KR_Phonemizer, BN_Phonemizer)}
ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
_ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["be"] = BEL_Phonemizer.name()
# JA phonemizer has deal breaking dependencies like MeCab for some systems.
# So we only register it when its dependencies can be imported.
if JA_JP_Phonemizer is not None:
PHONEMIZERS[JA_JP_Phonemizer.name()] = JA_JP_Phonemizer
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
"""Initiate a phonemizer by name
Args:
name (str):
Name of the phonemizer that should match `phonemizer.name()`.
kwargs (dict):
Extra keyword arguments that should be passed to the phonemizer.
"""
if name == "espeak":
return ESpeak(**kwargs)
if name == "gruut":
return Gruut(**kwargs)
if name == "zh_cn_phonemizer":
return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer":
if JA_JP_Phonemizer is None:
raise ValueError(" ❗ You need to install JA phonemizer dependencies. Try `pip install TTS[ja]`.")
return JA_JP_Phonemizer(**kwargs)
if name == "ko_kr_phonemizer":
return KO_KR_Phonemizer(**kwargs)
if name == "bn_phonemizer":
return BN_Phonemizer(**kwargs)
if name == "be_phonemizer":
return BEL_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found")
if __name__ == "__main__":
print(DEF_LANG_TO_PHONEMIZER)
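# A hedged usage sketch (not part of the original module): look up the default backend for a
# language code and instantiate it by name.
# if __name__ == "__main__":
#     name = DEF_LANG_TO_PHONEMIZER["en-us"]  # typically "espeak", since espeak entries override gruut ones
#     phonemizer = get_phonemizer_by_name(name, language="en-us")
#     print(phonemizer.name(), phonemizer.language)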
@@ -0,0 +1,62 @@
from typing import Dict
from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_BN_PUNCS = "、.,[]()?!〽~『』「」【】"
class BN_Phonemizer(BasePhonemizer):
"""🐸TTS bn phonemizer using functions in `TTS.tts.utils.text.bangla.phonemizer`
Args:
punctuations (str):
Set of characters to be treated as punctuation. Defaults to `_DEF_BN_PUNCS`.
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to False.
TODO: someone with Bangla knowledge should check this implementation
"""
language = "bn"
def __init__(self, punctuations=_DEF_BN_PUNCS, keep_puncs=False, **kwargs):  # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "bn_phonemizer"
@staticmethod
def phonemize_bn(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
ph = bangla_text_to_phonemes(text)
return ph
def _phonemize(self, text, separator):
return self.phonemize_bn(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"bn": "Bangla"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
txt = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন বলে."
e = BN_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print("`" + e.phonemize(txt) + "`")
@@ -0,0 +1,140 @@
import abc
from typing import List, Tuple
from TTS.tts.utils.text.punctuation import Punctuation
class BasePhonemizer(abc.ABC):
"""Base phonemizer class
Phonemization follows the following steps:
1. Preprocessing:
- remove empty lines
- remove punctuation
- keep track of punctuation marks
2. Phonemization:
- convert text to phonemes
3. Postprocessing:
- join phonemes
- restore punctuation marks
Args:
language (str):
Language used by the phonemizer.
punctuations (List[str]):
List of punctuation marks to be preserved.
keep_puncs (bool):
Whether to preserve punctuation marks or not.
"""
def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
# ensure the backend is installed on the system
if not self.is_available():
raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover
# ensure the backend supports the requested language
self._language = self._init_language(language)
# setup punctuation processing
self._keep_puncs = keep_puncs
self._punctuator = Punctuation(punctuations)
def _init_language(self, language):
"""Language initialization
This method may be overloaded in child classes (see Segments backend)
"""
if not self.is_supported_language(language):
raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend")
return language
@property
def language(self):
"""The language code configured to be used for phonemization"""
return self._language
@staticmethod
@abc.abstractmethod
def name():
"""The name of the backend"""
...
@classmethod
@abc.abstractmethod
def is_available(cls):
"""Returns True if the backend is installed, False otherwise"""
...
@classmethod
@abc.abstractmethod
def version(cls):
"""Return the backend version as a tuple (major, minor, patch)"""
...
@staticmethod
@abc.abstractmethod
def supported_languages():
"""Return a dict of language codes -> name supported by the backend"""
...
def is_supported_language(self, language):
"""Returns True if `language` is supported by the backend"""
return language in self.supported_languages()
@abc.abstractmethod
def _phonemize(self, text, separator):
"""The main phonemization method"""
def _phonemize_preprocess(self, text) -> Tuple[List[str], List]:
"""Preprocess the text before phonemization
1. strip leading and trailing whitespace
2. strip punctuation (kept aside for restoration when `keep_puncs` is True)
Override this if you need a different behaviour
"""
text = text.strip()
if self._keep_puncs:
# a tuple (text, punctuation marks)
return self._punctuator.strip_to_restore(text)
return [self._punctuator.strip(text)], []
def _phonemize_postprocess(self, phonemized, punctuations) -> str:
"""Postprocess the raw phonemized output
Override this if you need a different behaviour
"""
if self._keep_puncs:
return self._punctuator.restore(phonemized, punctuations)[0]
return phonemized[0]
def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument
"""Returns the `text` phonemized for the given language
Args:
text (str):
Text to be phonemized.
separator (str):
String separator used between phonemes. Defaults to '|'.
Returns:
(str): Phonemized text
"""
text, punctuations = self._phonemize_preprocess(text)
phonemized = []
for t in text:
p = self._phonemize(t, separator)
phonemized.append(p)
phonemized = self._phonemize_postprocess(phonemized, punctuations)
return phonemized
def print_logs(self, level: int = 0):
indent = "\t" * level
print(f"{indent}| > phoneme language: {self.language}")
print(f"{indent}| > phoneme backend: {self.name()}")
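# A minimal subclass sketch (illustrative only, not part of the original file) showing which
# members a concrete backend must provide; the "phonemization" below is a dummy upper-casing.
# class UpperCasePhonemizer(BasePhonemizer):
#     @staticmethod
#     def name():
#         return "uppercase"
#     @classmethod
#     def is_available(cls):
#         return True
#     @classmethod
#     def version(cls):
#         return "0.0.1"
#     @staticmethod
#     def supported_languages():
#         return {"en-us": "English (dummy)"}
#     def _phonemize(self, text, separator):
#         return separator.join(text.upper())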
@@ -0,0 +1,55 @@
from typing import Dict
from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_BE_PUNCS = ",!." # TODO
class BEL_Phonemizer(BasePhonemizer):
"""🐸TTS be phonemizer using functions in `TTS.tts.utils.text.belarusian.phonemizer`
Args:
punctuations (str):
Set of characters to be treated as punctuation. Defaults to `_DEF_BE_PUNCS`.
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to False.
"""
language = "be"
def __init__(self, punctuations=_DEF_BE_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "be_phonemizer"
@staticmethod
def phonemize_be(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
return belarusian_text_to_phonemes(text)
def _phonemize(self, text, separator):
return self.phonemize_be(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"be": "Belarusian"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
txt = "тэст"
e = BEL_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print("`" + e.phonemize(txt) + "`")
@@ -0,0 +1,264 @@
import logging
import re
import subprocess
from typing import Dict, List
from packaging.version import Version
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.punctuation import Punctuation
def is_tool(name):
from shutil import which
return which(name) is not None
# Use a regex pattern to match the espeak version, because it may be
# symlinked to espeak-ng, which moves the version bits to another spot.
espeak_version_pattern = re.compile(r"text-to-speech:\s(?P<version>\d+\.\d+(\.\d+)?)")
def get_espeak_version():
output = subprocess.getoutput("espeak --version")
match = espeak_version_pattern.search(output)
return match.group("version")
def get_espeakng_version():
output = subprocess.getoutput("espeak-ng --version")
return output.split()[3]
# priority: espeakng > espeak
if is_tool("espeak-ng"):
_DEF_ESPEAK_LIB = "espeak-ng"
_DEF_ESPEAK_VER = get_espeakng_version()
elif is_tool("espeak"):
_DEF_ESPEAK_LIB = "espeak"
_DEF_ESPEAK_VER = get_espeak_version()
else:
_DEF_ESPEAK_LIB = None
_DEF_ESPEAK_VER = None
def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]:
"""Run espeak with the given arguments."""
cmd = [
espeak_lib,
"-q",
"-b",
"1", # UTF8 text encoding
]
cmd.extend(args)
logging.debug("espeakng: executing %s", repr(cmd))
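# Note on `sync` (comment added for clarity): when False, the raw stdout line iterator is
# returned and the pipes are closed right away; when True, all output lines are collected,
# the process is waited on, and the list of byte strings is returned.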
with subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
) as p:
res = iter(p.stdout.readline, b"")
if not sync:
p.stdout.close()
if p.stderr:
p.stderr.close()
if p.stdin:
p.stdin.close()
return res
res2 = []
for line in res:
res2.append(line)
p.stdout.close()
if p.stderr:
p.stderr.close()
if p.stdin:
p.stdin.close()
p.wait()
return res2
class ESpeak(BasePhonemizer):
"""ESpeak wrapper calling `espeak` or `espeak-ng` from the command line to perform G2P
Args:
language (str):
Valid language code for the used backend.
backend (str):
Name of the backend library to use. `espeak` or `espeak-ng`. If None, set automatically
preferring `espeak-ng` over `espeak`. Defaults to None.
punctuations (str):
Characters to be treated as punctuation. Defaults to Punctuation.default_puncs().
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to True.
Example:
>>> from TTS.tts.utils.text.phonemizers import ESpeak
>>> phonemizer = ESpeak("tr")
>>> phonemizer.phonemize("Bu Türkçe, bir örnektir.", separator="|")
'b|ʊ t|ˈø|r|k|tʃ|ɛ, b|ɪ|r œ|r|n|ˈɛ|c|t|ɪ|r.'
"""
_ESPEAK_LIB = _DEF_ESPEAK_LIB
_ESPEAK_VER = _DEF_ESPEAK_VER
def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True):
if self._ESPEAK_LIB is None:
raise Exception(" [!] No espeak backend found. Install espeak-ng or espeak to your system.")
self.backend = self._ESPEAK_LIB
# band-aid for backwards compatibility
if language == "en":
language = "en-us"
if language == "zh-cn":
language = "cmn"
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
if backend is not None:
self.backend = backend
@property
def backend(self):
return self._ESPEAK_LIB
@property
def backend_version(self):
return self._ESPEAK_VER
@backend.setter
def backend(self, backend):
if backend not in ["espeak", "espeak-ng"]:
raise Exception("Unknown backend: %s" % backend)
self._ESPEAK_LIB = backend
self._ESPEAK_VER = get_espeakng_version() if backend == "espeak-ng" else get_espeak_version()
def auto_set_espeak_lib(self) -> None:
if is_tool("espeak-ng"):
self._ESPEAK_LIB = "espeak-ng"
self._ESPEAK_VER = get_espeakng_version()
elif is_tool("espeak"):
self._ESPEAK_LIB = "espeak"
self._ESPEAK_VER = get_espeak_version()
else:
raise Exception("Cannot set backend automatically. espeak-ng or espeak not found")
@staticmethod
def name():
return "espeak"
def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str:
"""Convert input text to phonemes.
Args:
text (str):
Text to be converted to phonemes.
tie (bool, optional) : When True use a '͡' character between
consecutive characters of a single phoneme. Else separate phoneme
with '_'. This option requires espeak>=1.49. Default to False.
"""
# set arguments
args = ["-v", f"{self._language}"]
# espeak and espeak-ng parse `ipa` differently
if tie:
# use '͡' between phonemes
if self.backend == "espeak":
args.append("--ipa=1")
else:
args.append("--ipa=3")
else:
# split with '_'
if self.backend == "espeak":
if Version(self.backend_version) >= Version("1.48.15"):
args.append("--ipa=1")
else:
args.append("--ipa=3")
else:
args.append("--ipa=1")
if tie:
args.append("--tie=%s" % tie)
args.append(text)
# compute phonemes
phonemes = ""
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
logging.debug("line: %s", repr(line))
ph_decoded = line.decode("utf8").strip()
# espeak:
# version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak-ng:
# "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak-ng backend can add language flags that need to be removed:
# "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
# phonemize needs to remove the language flags of the returned text:
# "sɛʁtˈɛ̃ mˈo kɔm fˈʊtbɔːl ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
ph_decoded = re.sub(r"\(.+?\)", "", ph_decoded)
phonemes += ph_decoded.strip()
return phonemes.replace("_", separator)
def _phonemize(self, text, separator=None):
return self.phonemize_espeak(text, separator, tie=False)
@staticmethod
def supported_languages() -> Dict:
"""Get a dictionary of supported languages.
Returns:
Dict: Dictionary of language codes.
"""
if _DEF_ESPEAK_LIB is None:
return {}
args = ["--voices"]
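# `espeak --voices` prints a header row followed by one voice per line; the loop below skips
# the header (count == 0) and takes column 2 as the language code and column 4 as its name.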
langs = {}
count = 0
for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True):
line = line.decode("utf8").strip()
if count > 0:
cols = line.split()
lang_code = cols[1]
lang_name = cols[3]
langs[lang_code] = lang_name
logging.debug("line: %s", repr(line))
count += 1
return langs
def version(self) -> str:
"""Get the version of the used backend.
Returns:
str: Version of the used backend.
"""
args = ["--version"]
for line in _espeak_exe(self.backend, args, sync=True):
version = line.decode("utf8").strip().split()[2]
logging.debug("line: %s", repr(line))
return version
@classmethod
def is_available(cls):
"""Return true if ESpeak is available else false"""
return is_tool("espeak") or is_tool("espeak-ng")
if __name__ == "__main__":
e = ESpeak(language="en-us")
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
e = ESpeak(language="en-us", keep_puncs=False)
print("`" + e.phonemize("hello how are you today?") + "`")
e = ESpeak(language="en-us", keep_puncs=True)
print("`" + e.phonemize("hello how are you today?") + "`")
@@ -0,0 +1,151 @@
import importlib
from typing import List
import gruut
from gruut_ipa import IPA
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.punctuation import Punctuation
# Table for str.translate to fix gruut/TTS phoneme mismatch
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
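# e.g. gruut's ASCII "g" (U+0067) is rewritten to IPA "ɡ" (U+0261) to match the 🐸TTS phoneme set.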
class Gruut(BasePhonemizer):
"""Gruut wrapper for G2P
Args:
language (str):
Valid language code for the used backend.
punctuations (str):
Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.
keep_puncs (bool):
If true, keep the punctuations after phonemization. Defaults to True.
use_espeak_phonemes (bool):
If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.
keep_stress (bool):
If true, keep the stress characters after phonemization. Defaults to False.
Example:
>>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
>>> phonemizer = Gruut('en-us')
>>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
"""
def __init__(
self,
language: str,
punctuations=Punctuation.default_puncs(),
keep_puncs=True,
use_espeak_phonemes=False,
keep_stress=False,
):
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
self.use_espeak_phonemes = use_espeak_phonemes
self.keep_stress = keep_stress
@staticmethod
def name():
return "gruut"
def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str: # pylint: disable=unused-argument
"""Convert input text to phonemes.
Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters
that constitute a single sound.
It doesn't affect 🐸TTS since it individually converts each character to token IDs.
Examples::
"hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`
Args:
text (str):
Text to be converted to phonemes.
tie (bool, optional) : When True use a '͡' character between
consecutive characters of a single phoneme. Else separate phoneme
with '_'. This option requires espeak>=1.49. Default to False.
"""
ph_list = []
for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
for word in sentence:
if word.is_break:
# Use actual character for break phoneme (e.g., comma)
if ph_list:
# Join with previous word
ph_list[-1].append(word.text)
else:
# First word is punctuation
ph_list.append([word.text])
elif word.phonemes:
# Add phonemes for word
word_phonemes = []
for word_phoneme in word.phonemes:
if not self.keep_stress:
# Remove primary/secondary stress
word_phoneme = IPA.without_stress(word_phoneme)
word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)
if word_phoneme:
# Flatten phonemes
word_phonemes.extend(word_phoneme)
if word_phonemes:
ph_list.append(word_phonemes)
ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
ph = f"{separator} ".join(ph_words)
return ph
def _phonemize(self, text, separator):
return self.phonemize_gruut(text, separator, tie=False)
def is_supported_language(self, language):
"""Returns True if `language` is supported by the backend"""
return gruut.is_language_supported(language)
@staticmethod
def supported_languages() -> List:
"""Get a dictionary of supported languages.
Returns:
List: List of language codes.
"""
return list(gruut.get_supported_languages())
def version(self):
"""Get the version of the used backend.
Returns:
str: Version of the used backend.
"""
return gruut.__version__
@classmethod
def is_available(cls):
"""Return True if gruut is available else False"""
return importlib.util.find_spec("gruut") is not None
if __name__ == "__main__":
e = Gruut(language="en-us")
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
e = Gruut(language="en-us", keep_puncs=False)
print("`" + e.phonemize("hello how are you today?") + "`")
e = Gruut(language="en-us", keep_puncs=True)
print("`" + e.phonemize("hello how, are you today?") + "`")
@@ -0,0 +1,72 @@
from typing import Dict
from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】"
_TRANS_TABLE = {"、": ","}
def trans(text):
for i, j in _TRANS_TABLE.items():
text = text.replace(i, j)
return text
class JA_JP_Phonemizer(BasePhonemizer):
"""🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer`
TODO: someone with JA knowledge should check this implementation
Example:
>>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer
>>> phonemizer = JA_JP_Phonemizer()
>>> phonemizer.phonemize("どちらに行きますか?", separator="|")
'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?'
"""
language = "ja-jp"
def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ja_jp_phonemizer"
def _phonemize(self, text: str, separator: str = "|") -> str:
ph = japanese_text_to_phonemes(text)
if separator is not None and separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator="|", language=None) -> str:
"""Custom phonemize for JP_JA
Skip pre-post processing steps used by the other phonemizers.
"""
return self._phonemize(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"ja-jp": "Japanese (Japan)"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
# if __name__ == "__main__":
# text = "これは、電話をかけるための私の日本語の例のテキストです。"
# e = JA_JP_Phonemizer()
# print(e.supported_languages())
# print(e.version())
# print(e.language)
# print(e.name())
# print(e.is_available())
# print("`" + e.phonemize(text) + "`")
@@ -0,0 +1,65 @@
from typing import Dict
from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
class KO_KR_Phonemizer(BasePhonemizer):
"""🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
Example:
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
"""
language = "ko-kr"
def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ko_kr_phonemizer"
def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
ph = korean_text_to_phonemes(text, character=character)
if separator is not None and separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str:
return self._phonemize(text, separator, character)
@staticmethod
def supported_languages() -> Dict:
return {"ko-kr": "hangeul(korean)"}
def version(self) -> str:
return "0.0.2"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
e = KO_KR_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print(e.phonemize(texts))
@@ -0,0 +1,65 @@
from typing import Dict, List
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
class MultiPhonemizer:
"""🐸TTS multi-phonemizer that operates phonemizers for multiple languages
Args:
custom_lang_to_phonemizer (Dict):
Custom phonemizer mapping if you want to change the defaults. In the format of
`{"lang_code": "phonemizer_name"}`. When a value is an empty string, `DEF_LANG_TO_PHONEMIZER` is used. Defaults to `{}`.
TODO: find a way to pass custom kwargs to the phonemizers
"""
lang_to_phonemizer = {}
def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value
for k, v in lang_to_phonemizer_name.items():
if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys():
lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k]
elif v == "":
raise ValueError(f"Phonemizer wasn't set for language {k} and doesn't have a default.")
self.lang_to_phonemizer_name = lang_to_phonemizer_name
self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
@staticmethod
def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict:
lang_to_phonemizer = {}
for k, v in lang_to_phonemizer_name.items():
lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k)
return lang_to_phonemizer
@staticmethod
def name():
return "multi-phonemizer"
def phonemize(self, text, separator="|", language=""):
if language == "":
raise ValueError("Language must be set for multi-phonemizer to phonemize.")
return self.lang_to_phonemizer[language].phonemize(text, separator)
def supported_languages(self) -> List:
return list(self.lang_to_phonemizer.keys())
def print_logs(self, level: int = 0):
indent = "\t" * level
print(f"{indent}| > phoneme language: {self.supported_languages()}")
print(f"{indent}| > phoneme backend: {self.name()}")
# if __name__ == "__main__":
# texts = {
# "tr": "Merhaba, bu Türkçe bit örnek!",
# "en-us": "Hello, this is English example!",
# "de": "Hallo, das ist ein Deutches Beipiel!",
# "zh-cn": "这是中国的例子",
# }
# phonemes = {}
# ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})
# for lang, text in texts.items():
# phoneme = ph.phonemize(text, lang)
# phonemes[lang] = phoneme
# print(phonemes)
@@ -0,0 +1,62 @@
from typing import Dict
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
class ZH_CN_Phonemizer(BasePhonemizer):
"""🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer`
Args:
punctuations (str):
Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
keep_puncs (bool):
If True, keep the punctuations after phonemization. Defaults to False.
Example ::
"这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| || |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
TODO: someone with Mandarin knowledge should check this implementation
"""
language = "zh-cn"
def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "zh_cn_phonemizer"
@staticmethod
def phonemize_zh_cn(text: str, separator: str = "|") -> str:
ph = chinese_text_to_phonemes(text, separator)
return ph
def _phonemize(self, text, separator):
return self.phonemize_zh_cn(text, separator)
@staticmethod
def supported_languages() -> Dict:
return {"zh-cn": "Chinese (China)"}
def version(self) -> str:
return "0.0.1"
def is_available(self) -> bool:
return True
# if __name__ == "__main__":
# text = "这是,样本中文。"
# e = ZH_CN_Phonemizer()
# print(e.supported_languages())
# print(e.version())
# print(e.language)
# print(e.name())
# print(e.is_available())
# print("`" + e.phonemize(text) + "`")
@@ -0,0 +1,171 @@
import collections
import re
from enum import Enum
import six
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'
_PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])
class PuncPosition(Enum):
"""Enum for the punctuations positions"""
BEGIN = 0
END = 1
MIDDLE = 2
class Punctuation:
"""Handle punctuations in text.
Just strip punctuations from text or strip and restore them later.
Args:
puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.
Example:
>>> punc = Punctuation()
>>> punc.strip("This is. example !")
'This is example'
>>> text_striped, punc_map = punc.strip_to_restore("This is. example !")
>>> ' '.join(text_striped)
'This is example'
>>> text_restored = punc.restore(text_striped, punc_map)
>>> text_restored[0]
'This is. example !'
"""
def __init__(self, puncs: str = _DEF_PUNCS):
self.puncs = puncs
@staticmethod
def default_puncs():
"""Return default set of punctuations."""
return _DEF_PUNCS
@property
def puncs(self):
return self._puncs
@puncs.setter
def puncs(self, value):
if not isinstance(value, six.string_types):
raise ValueError("[!] Punctuations must be of type str.")
self._puncs = "".join(list(dict.fromkeys(list(value))))  # remove duplicates without changing the order
self.puncs_regular_exp = re.compile(rf"(\s*[{re.escape(self._puncs)}]+\s*)+")
def strip(self, text):
"""Remove all the punctuations by replacing with `space`.
Args:
text (str): The text to be processed.
Example::
"This is. example !" -> "This is example "
"""
return re.sub(self.puncs_regular_exp, " ", text).rstrip().lstrip()
def strip_to_restore(self, text):
"""Remove punctuations from text to restore them later.
Args:
text (str): The text to be processed.
Examples ::
"This is. example !" -> [["This is", "example"], [".", "!"]]
"""
text, puncs = self._strip_to_restore(text)
return text, puncs
def _strip_to_restore(self, text):
"""Auxiliary method for Punctuation.strip_to_restore()"""
matches = list(re.finditer(self.puncs_regular_exp, text))
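# This method returns a pair: the chunks of text between punctuation marks, and a list of
# _PUNC_IDX records (mark + BEGIN/MIDDLE/END position) that restore() uses to put the marks back.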
if not matches:
return [text], []
# the text is only punctuations
if len(matches) == 1 and matches[0].group() == text:
return [], [_PUNC_IDX(text, PuncPosition.BEGIN)]
# build a punctuation map to be used later to restore punctuations
puncs = []
for match in matches:
position = PuncPosition.MIDDLE
if match == matches[0] and text.startswith(match.group()):
position = PuncPosition.BEGIN
elif match == matches[-1] and text.endswith(match.group()):
position = PuncPosition.END
puncs.append(_PUNC_IDX(match.group(), position))
# convert str text to a List[str], each item is separated by a punctuation
splitted_text = []
for idx, punc in enumerate(puncs):
split = text.split(punc.punc)
prefix, suffix = split[0], punc.punc.join(split[1:])
text = suffix
if prefix == "":
# We don't want to insert an empty string in case of initial punctuation
continue
splitted_text.append(prefix)
# if the text does not end with a punctuation, add it to the last item
if idx == len(puncs) - 1 and len(suffix) > 0:
splitted_text.append(suffix)
return splitted_text, puncs
@classmethod
def restore(cls, text, puncs):
"""Restore punctuation in a text.
Args:
text (str): The text to be processed.
puncs (List[str]): The list of punctuations map to be used for restoring.
Examples ::
['This is', 'example'], ['.', '!'] -> "This is. example!"
"""
return cls._restore(text, puncs)
@classmethod
def _restore(cls, text, puncs): # pylint: disable=too-many-return-statements
"""Auxiliary method for Punctuation.restore()"""
if not puncs:
return text
# nothing has been phonemized, return the puncs alone
if not text:
return ["".join(m.punc for m in puncs)]
current = puncs[0]
if current.position == PuncPosition.BEGIN:
return cls._restore([current.punc + text[0]] + text[1:], puncs[1:])
if current.position == PuncPosition.END:
return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:])
# POSITION == MIDDLE
if len(text) == 1: # pragma: nocover
# a corner case where the final part of an intermediate
# mark (I) has not been phonemized
return cls._restore([text[0] + current.punc], puncs[1:])
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:])
# if __name__ == "__main__":
# punc = Punctuation()
# text = "This is. This is, example!"
# print(punc.strip(text))
# split_text, puncs = punc.strip_to_restore(text)
# print(split_text, " ---- ", puncs)
# restored_text = punc.restore(split_text, puncs)
# print(restored_text)
@@ -0,0 +1,216 @@
from typing import Callable, Dict, List, Union
from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
from TTS.utils.generic_utils import get_import_path, import_class
class TTSTokenizer:
"""🐸TTS tokenizer to convert input characters to token IDs and back.
Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.
Args:
use_phonemes (bool):
Whether to use phonemes instead of characters. Defaults to False.
characters (Characters):
A Characters object to use for character-to-ID and ID-to-character mappings.
text_cleaner (callable):
A function to pre-process the text before tokenization and phonemization. Defaults to None.
phonemizer (Phonemizer):
A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.
Example:
>>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
>>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
>>> text = "Hello world!"
>>> ids = tokenizer.text_to_ids(text)
>>> text_hat = tokenizer.ids_to_text(ids)
>>> assert text == text_hat
"""
def __init__(
self,
use_phonemes=False,
text_cleaner: Callable = None,
characters: "BaseCharacters" = None,
phonemizer: Union["Phonemizer", Dict] = None,
add_blank: bool = False,
use_eos_bos=False,
):
self.text_cleaner = text_cleaner
self.use_phonemes = use_phonemes
self.add_blank = add_blank
self.use_eos_bos = use_eos_bos
self.characters = characters
self.not_found_characters = []
self.phonemizer = phonemizer
@property
def characters(self):
return self._characters
@characters.setter
def characters(self, new_characters):
self._characters = new_characters
self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
def encode(self, text: str) -> List[int]:
"""Encodes a string of text as a sequence of IDs."""
token_ids = []
for char in text:
try:
idx = self.characters.char_to_id(char)
token_ids.append(idx)
except KeyError:
# discard but store not found characters
if char not in self.not_found_characters:
self.not_found_characters.append(char)
print(text)
print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
return token_ids
def decode(self, token_ids: List[int]) -> str:
"""Decodes a sequence of IDs to a string of text."""
text = ""
for token_id in token_ids:
text += self.characters.id_to_char(token_id)
return text
def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument
"""Converts a string of text to a sequence of token IDs.
Args:
text(str):
The text to convert to token IDs.
language(str):
The language code of the text. Defaults to None.
TODO:
- Add support for language-specific processing.
1. Text normalization
2. Phonemization (if use_phonemes is True)
3. Add blank char between characters
4. Add BOS and EOS characters
5. Text to token IDs
"""
# TODO: text cleaner should pick the right routine based on the language
if self.text_cleaner is not None:
text = self.text_cleaner(text)
if self.use_phonemes:
text = self.phonemizer.phonemize(text, separator="", language=language)
text = self.encode(text)
if self.add_blank:
text = self.intersperse_blank_char(text, True)
if self.use_eos_bos:
text = self.pad_with_bos_eos(text)
return text
def ids_to_text(self, id_sequence: List[int]) -> str:
"""Converts a sequence of token IDs to a string of text."""
return self.decode(id_sequence)
def pad_with_bos_eos(self, char_sequence: List[str]):
"""Pads a sequence with the special BOS and EOS characters."""
return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
"""Intersperses the blank character between characters in a sequence.
Use the ```blank``` character if defined else use the ```pad``` character.
"""
char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
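# e.g. [5, 3, 7] with blank id 0 -> [0, 5, 0, 3, 0, 7, 0]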
result = [char_to_use] * (len(char_sequence) * 2 + 1)
result[1::2] = char_sequence
return result
def print_logs(self, level: int = 0):
indent = "\t" * level
print(f"{indent}| > add_blank: {self.add_blank}")
print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
print(f"{indent}| > use_phonemes: {self.use_phonemes}")
if self.use_phonemes:
print(f"{indent}| > phonemizer:")
self.phonemizer.print_logs(level + 1)
if len(self.not_found_characters) > 0:
print(f"{indent}| > {len(self.not_found_characters)} not found characters:")
for char in self.not_found_characters:
print(f"{indent}| > {char}")
@staticmethod
def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
"""Init Tokenizer object from config
Args:
config (Coqpit): Coqpit model config.
characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
the config values. Defaults to None.
"""
# init cleaners
text_cleaner = None
if isinstance(config.text_cleaner, (str, list)):
text_cleaner = getattr(cleaners, config.text_cleaner)
# init characters
if characters is None:
# set characters based on defined characters class
if config.characters and config.characters.characters_class:
CharactersClass = import_class(config.characters.characters_class)
characters, new_config = CharactersClass.init_from_config(config)
# set characters based on config
else:
if config.use_phonemes:
# init phoneme set
characters, new_config = IPAPhonemes().init_from_config(config)
else:
# init character set
characters, new_config = Graphemes().init_from_config(config)
else:
characters, new_config = characters.init_from_config(config)
# set characters class
new_config.characters.characters_class = get_import_path(characters)
# init phonemizer
phonemizer = None
if config.use_phonemes:
if "phonemizer" in config and config.phonemizer == "multi_phonemizer":
lang_to_phonemizer_name = {}
for dataset in config.datasets:
if dataset.language != "":
lang_to_phonemizer_name[dataset.language] = dataset.phonemizer
else:
raise ValueError("Multi phonemizer requires language to be set for each dataset.")
phonemizer = MultiPhonemizer(lang_to_phonemizer_name)
else:
phonemizer_kwargs = {"language": config.phoneme_language}
if "phonemizer" in config and config.phonemizer:
phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
else:
try:
phonemizer = get_phonemizer_by_name(
DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
)
new_config.phonemizer = phonemizer.name()
except KeyError as e:
raise ValueError(
f"""No phonemizer found for language {config.phoneme_language}.
You may need to install a third party library for this language."""
) from e
return (
TTSTokenizer(
config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
),
new_config,
)
@@ -0,0 +1,238 @@
import librosa
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.colors import LogNorm
matplotlib.use("Agg")
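# Agg is a non-interactive backend, so these plotting helpers also work on headless machines.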
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False, plot_log=False):
if isinstance(alignment, torch.Tensor):
alignment_ = alignment.detach().cpu().numpy().squeeze()
else:
alignment_ = alignment
alignment_ = alignment_.astype(np.float32) if alignment_.dtype == np.float16 else alignment_
fig, ax = plt.subplots(figsize=fig_size)
im = ax.imshow(
alignment_.T, aspect="auto", origin="lower", interpolation="none", norm=LogNorm() if plot_log else None
)
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
# plt.yticks(range(len(text)), list(text))
plt.tight_layout()
if title is not None:
plt.title(title)
if not output_fig:
plt.close()
return fig
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
if isinstance(spectrogram, torch.Tensor):
spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
else:
spectrogram_ = spectrogram.T
spectrogram_ = spectrogram_.astype(np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
if ap is not None:
spectrogram_ = ap.denormalize(spectrogram_) # pylint: disable=protected-access
fig = plt.figure(figsize=fig_size)
plt.imshow(spectrogram_, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
if not output_fig:
plt.close()
return fig
def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False):
"""Plot pitch curves on top of the spectrogram.
Args:
pitch (np.array): Pitch values.
spectrogram (np.array): Spectrogram values.
Shapes:
pitch: :math:`(T,)`
spec: :math:`(C, T)`
"""
if isinstance(spectrogram, torch.Tensor):
spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
else:
spectrogram_ = spectrogram.T
spectrogram_ = spectrogram_.astype(np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
if ap is not None:
spectrogram_ = ap.denormalize(spectrogram_) # pylint: disable=protected-access
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
ax.imshow(spectrogram_, aspect="auto", origin="lower")
ax.set_xlabel("time")
ax.set_ylabel("spec_freq")
ax2 = ax.twinx()
ax2.plot(pitch, linewidth=5.0, color="red")
ax2.set_ylabel("F0")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
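# A hedged usage sketch (not part of the original file), assuming a spectrogram of shape (C, T)
# and a pitch contour of shape (T,):
# if __name__ == "__main__":
#     spec = np.random.rand(80, 120)
#     f0 = np.random.rand(120)
#     plot_pitch(f0, spec, fig_size=(12, 6), output_fig=True)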
def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
"""Plot pitch curves on top of the input characters.
Args:
pitch (np.array): Pitch values.
chars (str): Characters to place to the x-axis.
Shapes:
pitch: :math:`(T,)`
"""
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
x = np.array(range(len(chars)))
my_xticks = chars
plt.xticks(x, my_xticks)
ax.set_xlabel("characters")
ax.set_ylabel("freq")
ax2 = ax.twinx()
ax2.plot(pitch, linewidth=5.0, color="red")
ax2.set_ylabel("F0")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False):
"""Plot energy curves on top of the input characters.
Args:
energy (np.array): energy values.
chars (str): Characters to place to the x-axis.
Shapes:
energy: :math:`(T,)`
"""
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
x = np.array(range(len(chars)))
my_xticks = chars
plt.xticks(x, my_xticks)
ax.set_xlabel("characters")
ax.set_ylabel("freq")
ax2 = ax.twinx()
ax2.plot(energy, linewidth=5.0, color="red")
ax2.set_ylabel("energy")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
def visualize(
alignment,
postnet_output,
text,
hop_length,
CONFIG,
tokenizer,
stop_tokens=None,
decoder_output=None,
output_path=None,
figsize=(8, 24),
output_fig=False,
):
"""Intended to be used in Notebooks."""
if decoder_output is not None:
num_plot = 4
else:
num_plot = 3
label_fontsize = 16
fig = plt.figure(figsize=figsize)
plt.subplot(num_plot, 1, 1)
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
plt.xlabel("Decoder timestep", fontsize=label_fontsize)
plt.ylabel("Encoder timestep", fontsize=label_fontsize)
# compute phoneme representation and back
if CONFIG.use_phonemes:
seq = tokenizer.text_to_ids(text)
text = tokenizer.ids_to_text(seq)
print(text)
plt.yticks(range(len(text)), list(text))
plt.colorbar()
if stop_tokens is not None:
# plot stopnet predictions
plt.subplot(num_plot, 1, 2)
plt.plot(range(len(stop_tokens)), list(stop_tokens))
# plot postnet spectrogram
plt.subplot(num_plot, 1, 3)
librosa.display.specshow(
postnet_output.T,
sr=CONFIG.audio["sample_rate"],
hop_length=hop_length,
x_axis="time",
y_axis="linear",
fmin=CONFIG.audio["mel_fmin"],
fmax=CONFIG.audio["mel_fmax"],
)
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if decoder_output is not None:
plt.subplot(num_plot, 1, 4)
librosa.display.specshow(
decoder_output.T,
sr=CONFIG.audio["sample_rate"],
hop_length=hop_length,
x_axis="time",
y_axis="linear",
fmin=CONFIG.audio["mel_fmin"],
fmax=CONFIG.audio["mel_fmax"],
)
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if output_path:
print(output_path)
fig.savefig(output_path)
plt.close()
if not output_fig:
plt.close()