From 4270a88cc87b5ced392472c466949a665c239907 Mon Sep 17 00:00:00 2001 From: Bryan Bonvallet Date: Tue, 2 Jan 2024 13:25:50 -0800 Subject: [PATCH 1/2] Update api.py tts() return_deterministic_state now includes generated conditioning alongside inputs --- tortoise/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tortoise/api.py b/tortoise/api.py index 69807b1..f46c42b 100644 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -386,6 +386,7 @@ class TextToSpeech: auto_conditioning, diffusion_conditioning = conditioning_latents else: auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents() + debug_conditioning = (auto_conditioning, diffusion_conditioning) auto_conditioning = auto_conditioning.to(self.device) diffusion_conditioning = diffusion_conditioning.to(self.device) @@ -581,7 +582,7 @@ class TextToSpeech: res = wav_candidates[0] if return_deterministic_state: - return res, (deterministic_seed, text, voice_samples, conditioning_latents) + return res, (deterministic_seed, text, voice_samples, conditioning_latents, debug_conditioning) else: return res def deterministic_state(self, seed=None): From 7729eaac2084f60b0484be2025e771050b513414 Mon Sep 17 00:00:00 2001 From: Bryan Bonvallet Date: Sat, 6 Jan 2024 13:11:58 -0800 Subject: [PATCH 2/2] Wrap api.tts in Gradio --- tortoise/gradiowrapper.py | 102 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 tortoise/gradiowrapper.py diff --git a/tortoise/gradiowrapper.py b/tortoise/gradiowrapper.py new file mode 100644 index 0000000..99d07ed --- /dev/null +++ b/tortoise/gradiowrapper.py @@ -0,0 +1,102 @@ +import torch +import torchaudio +import datetime +import tempfile +import gradio as gr +#import ffmpegio +# import numpy as np + +from api import TextToSpeech, MODELS_DIR +from utils.audio import load_voices + +tts = TextToSpeech(models_dir=MODELS_DIR, use_deepspeed=False, kv_cache=True, half=True) + +title = "TortoiseTTS UI" 
description = "TUDDLE over Gradio"
# NOTE(review): only the link text of this string survives in the source; the
# surrounding HTML markup (presumably an anchor to the GitHub repository)
# appears to have been stripped. Restore it from the original commit.
article = "\n\nGithub Repo\n\n"

examples = []

# Sample rate handed to torchaudio.save() for the generated waveform.
# TODO: where is this coded in the tts generative model code?
sample_rate = 24000


def _parse_speakers(speakers):
    """Split a comma-separated speaker string into a list of voice names.

    Whitespace around each name is dropped so that "a, b" selects the voices
    "a" and "b" rather than "a" and " b".
    """
    return [name.strip() for name in speakers.split(',')]


def _new_temp_path(suffix):
    """Create an empty, persistent temporary file and return its path.

    The handle is closed before the path is returned, so callers write to the
    file by name without a second handle being open on it.
    """
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def inference(speakers, text, seed, diterations):
    """Generate speech for ``text`` and return the paths of the result files.

    Args:
        speakers: One voice name, or several separated by commas.
        text: The text to synthesize.
        seed: Seed for deterministic generation. A negative value or an
            empty field (``None``) leaves seed selection to the model.
        diterations: Number of diffusion iterations; converted with ``int()``.

    Returns:
        A ``(wav_path, latents_path)`` tuple. ``latents_path`` is the ``.pth``
        file holding the conditioning latents used for the generation, or
        ``None`` when debug info is not requested.

    Raises:
        gr.Error: If the model returns a list of candidates rather than a
            single waveform.
    """
    # Debug state is always requested for now; the original intent was to
    # request it only for the 'random' speaker.
    get_debug_info = True

    voice_samples, conditioning_latents = load_voices(_parse_speakers(speakers))

    # The seed may arrive as None (empty field) or as a non-integer number.
    if seed is None or seed < 0:
        seed = None
    else:
        seed = int(seed)

    start = datetime.datetime.now()

    # k is how many samples to run.
    # cvvp amount above 0 if you need to reduce multiple speakers.
    # max_mel_tokens is the max number of 1/20 second length tokens used by
    # something under the hood and impacts output duration. 500 ~= 25 seconds
    retval = tts.tts(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        num_autoregressive_samples=96,
        diffusion_iterations=int(diterations),
        max_mel_tokens=500,
        use_deterministic_seed=seed,
        cvvp_amount=0.0,
        return_deterministic_state=get_debug_info,
    )

    debug_info = None
    if get_debug_info:
        gen, debug_state = retval
        # Index 4 of the deterministic state is the
        # (auto_conditioning, diffusion_conditioning) pair that tts() appends
        # after (seed, text, voice_samples, conditioning_latents).
        generated_latents = debug_state[4]
        debug_info = _new_temp_path(".pth")
        torch.save(generated_latents, debug_info)
    else:
        gen = retval

    if isinstance(gen, list):
        raise gr.Error("Keep k=1 to generate a single audio file.")

    audio_array = gen.squeeze(0).cpu()

    wav_path = _new_temp_path(".wav")
    torchaudio.save(wav_path, audio_array, sample_rate)
    print(wav_path, debug_info)
    print("duration", datetime.datetime.now() - start)
    # TODO: optionally transcode the WAV to MP3 (an ffmpegio-based variant was
    # sketched here but never enabled).
    return (wav_path, debug_info)


gr.Interface(
    fn=inference,
    inputs=[
        gr.components.Textbox(
            label="Speaker",
            value="random"
        ),
        gr.components.Textbox(
            label="Text",
            value="Hello, my dog is cute",
        ),
        gr.components.Number(
            label="Seed",
            value=-1,
        ),
        gr.components.Number(
            label="DIterations",
            value=80,
            minimum=30,
            maximum=400,
        ),
    ],
    outputs=[
        gr.components.Audio(label="Speech", type="filepath"),
        gr.components.File(label="Latent from Random", type="file"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
    allow_flagging='never',
).launch(debug=False, enable_queue=True, server_name="0.0.0.0")