From 4270a88cc87b5ced392472c466949a665c239907 Mon Sep 17 00:00:00 2001
From: Bryan Bonvallet <btbonval@gmail.com>
Date: Tue, 2 Jan 2024 13:25:50 -0800
Subject: [PATCH 1/2] Update api.py

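tts(): when return_deterministic_state is set, the returned state tuple now
also carries the conditioning latents that were actually used (including
randomly generated ones), appended after the original inputs. The tuple grows
from four to five elements, so callers that unpack exactly four must be updated.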
---
 tortoise/api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tortoise/api.py b/tortoise/api.py
index 69807b1..f46c42b 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -386,6 +386,7 @@ class TextToSpeech:
             auto_conditioning, diffusion_conditioning = conditioning_latents
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
+        debug_conditioning = (auto_conditioning, diffusion_conditioning)
         auto_conditioning = auto_conditioning.to(self.device)
         diffusion_conditioning = diffusion_conditioning.to(self.device)
 
@@ -581,7 +582,7 @@ class TextToSpeech:
                 res = wav_candidates[0]
 
             if return_deterministic_state:
-                return res, (deterministic_seed, text, voice_samples, conditioning_latents)
+                return res, (deterministic_seed, text, voice_samples, conditioning_latents, debug_conditioning)
             else:
                 return res
     def deterministic_state(self, seed=None):

From 7729eaac2084f60b0484be2025e771050b513414 Mon Sep 17 00:00:00 2001
From: Bryan Bonvallet <btbonval@gmail.com>
Date: Sat, 6 Jan 2024 13:11:58 -0800
Subject: [PATCH 2/2] Wrap api.tts in Gradio

---
 tortoise/gradiowrapper.py | 102 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 tortoise/gradiowrapper.py

diff --git a/tortoise/gradiowrapper.py b/tortoise/gradiowrapper.py
new file mode 100644
index 0000000..99d07ed
--- /dev/null
+++ b/tortoise/gradiowrapper.py
@@ -0,0 +1,102 @@
+import torch
+import torchaudio
+import datetime
+import tempfile
+import gradio as gr
+#import ffmpegio
+# import numpy as np
+
+from api import TextToSpeech, MODELS_DIR
+from utils.audio import load_voices
+
+tts = TextToSpeech(models_dir=MODELS_DIR, use_deepspeed=False, kv_cache=True, half=True)  # built once at import; inference() calls tts.tts() on it
+
+title = "TortoiseTTS UI"  # passed to gr.Interface as title
+description = "TUDDLE over Gradio"  # passed to gr.Interface as description
+article = "<p style='text-align: center'><a href='https://github.com/neonbjb/tortoise-tts' target='_blank' class='footer'>Github Repo</a></p>"  # passed to gr.Interface as article
+
+examples = [  # passed to gr.Interface as examples; currently empty
+]
+
+# Sample rate (Hz) given to torchaudio.save() in inference(); TODO: find where the tts model code defines it instead of hard-coding.
+sample_rate = 24000
+
+def inference(speakers, text, seed, diterations):
+    """Synthesize `text` with Tortoise; return (wav_path, latents_path).
+
+    speakers: comma-separated voice names handed to load_voices().
+    seed: sent as use_deterministic_seed; None or negative is sent as None.
+    diterations: diffusion iteration count, cast to int before use.
+    Raises gr.Error if tts() returns a list of clips instead of one.
+    """
+    # Tolerate "a, b" input: strip blanks, drop empties, default to the UI's "random".
+    voice_sel = [s.strip() for s in speakers.split(',') if s.strip()] or ['random']
+    voice_samples, conditioning_latents = load_voices(voice_sel)
+
+    # A cleared Number field may arrive as None, so test that before comparing.
+    seed = None if seed is None or seed < 0 else int(seed)
+
+    start = datetime.datetime.now()
+
+    # k is not passed, so tts() uses its default (presumably 1; checked below).
+    # cvvp amount above 0 if you need to reduce multiple speakers
+    # max_mel_tokens is the max number of 1/20 second length tokens used by something under the hood and impacts output duration. 500 ~= 25 seconds
+    # return_deterministic_state=True makes tts() return (audio, state).
+    gen, state = tts.tts(
+        text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
+        num_autoregressive_samples=96, diffusion_iterations=int(diterations),
+        max_mel_tokens=500, use_deterministic_seed=seed, cvvp_amount=0.0,
+        return_deterministic_state=True)
+
+    # Validate before writing anything so a failure leaves no stray temp files.
+    if isinstance(gen, list):
+        raise gr.Error("Keep k=1 to generate a single audio file.")
+
+    # state[4] is the (auto, diffusion) conditioning pair appended by tts().
+    # delete=False: the paths are returned to Gradio, so files must outlive this call.
+    with tempfile.NamedTemporaryFile(suffix=".pth", delete=False) as fp:
+        latents_path = fp.name
+    # Write by name only after the handle is closed (reopening fails on Windows).
+    torch.save(state[4], latents_path)
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        wav_path = fp.name
+    torchaudio.save(wav_path, gen.squeeze(0).cpu(), sample_rate)
+
+    print(wav_path, latents_path)
+    print("duration", datetime.datetime.now() - start)
+    return (wav_path, latents_path)
+
+
+# Input widgets, in the positional order inference() expects its arguments.
+speaker_box = gr.components.Textbox(label="Speaker", value="random")
+text_box = gr.components.Textbox(
+    label="Text",
+    value="Hello, my dog is cute",
+)
+# A negative seed is mapped to None inside inference().
+seed_box = gr.components.Number(label="Seed", value=-1)
+diterations_box = gr.components.Number(
+    label="DIterations",
+    value=80,
+    minimum=30,
+    maximum=400,
+)
+
+# Output widgets, matching the (wav path, latents path) pair inference() returns.
+audio_out = gr.components.Audio(label="Speech", type="filepath")
+latent_out = gr.components.File(label="Latent from Random", type="file")
+
+demo = gr.Interface(
+    fn=inference,
+    inputs=[speaker_box, text_box, seed_box, diterations_box],
+    outputs=[audio_out, latent_out],
+    title=title,
+    description=description,
+    article=article,
+    examples=examples,
+    allow_flagging='never',
+)
+
+# server_name="0.0.0.0" is passed so the server is not bound to localhost only.
+demo.launch(debug=False, enable_queue=True, server_name="0.0.0.0")