# ComfyUI-DeepFuze/tts_generation.py

import os
import torch
import argparse
from TTS.api import TTS

def main():
    """Synthesize speech for a text with a cloned voice and write it to a file.

    Parses the command-line arguments, loads the TTS model from the directory
    given by ``--model`` (its configuration is read from ``config.json`` inside
    that directory), moves the model to ``--device`` and writes the synthesized
    audio to ``--output_file``.

    Exits through ``argparse`` with a usage error when a required argument is
    missing or when the requested accelerator is not available.
    """
    parser = argparse.ArgumentParser(description="Run TTS with specified parameters.")

    parser.add_argument('--model', type=str, default="tts_models/multilingual/multi-dataset/xtts_v2", help="The TTS model to use.")
    parser.add_argument('--text', type=str, required=True, help="The text to be converted to speech.")
    parser.add_argument('--speaker_wav', type=str, required=True, help="The path to the speaker's wav file for voice cloning.")
    parser.add_argument('--language', type=str, default="en", help="The language of the text.")
    parser.add_argument('--output_file', type=str, default="output.wav", help="The output file path for the synthesized speech.")
    # The previous default expression evaluated to "cpu" on both branches, so
    # "cpu" is kept as the default to leave existing invocations unchanged.
    parser.add_argument('--device', type=str, choices=["cpu", "mps", "cuda"], default="cpu", help="The device to run the model on.")

    args = parser.parse_args()

    # Fail fast, before the (slow) model load, if the requested accelerator
    # cannot be used on this machine.
    if args.device == "cuda" and not torch.cuda.is_available():
        parser.error("--device cuda was requested but CUDA is not available.")
    if args.device == "mps":
        # Older torch builds have no ``torch.backends.mps`` attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is None or not mps_backend.is_available():
            parser.error("--device mps was requested but MPS is not available.")

    # Init TTS
    tts = TTS()
    tts.load_tts_model_by_path(
        model_path=args.model,
        config_path=os.path.join(args.model, "config.json"),
    )
    # Honor --device; previously the flag was parsed but silently ignored.
    tts.to(args.device)

    # Run TTS and save to file
    tts.tts_to_file(
        text=args.text,
        speaker_wav=args.speaker_wav,
        language=args.language,
        file_path=args.output_file,
    )

# Run the command-line entry point only when this file is executed as a
# script, so importing the module does not parse arguments or load a model.
if __name__ == "__main__":
    main()