From 7000b899ecae701a25e130b26022b35d8109ab69 Mon Sep 17 00:00:00 2001
From: manmay-nakhashi <manmay.nakhashi@gmail.com>
Date: Sun, 30 Jul 2023 13:02:50 +0530
Subject: [PATCH] Remove speaking_rate, add kv_cache/half to do_tts, fix read.py quoting

---
 tortoise/api.py    | 12 +++++-------
 tortoise/do_tts.py |  7 ++++---
 tortoise/read.py   |  2 +-
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tortoise/api.py b/tortoise/api.py
index 1ee924a..efa01fb 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -144,13 +144,12 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
 
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, speaking_rate = 1.0, temperature=1, verbose=True):
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
     with torch.no_grad():
         output_seq_len = latents.shape[1] * 4 * 24000 // 22050  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
-        output_seq_len = round(output_seq_len * speaking_rate)
         output_shape = (latents.shape[0], 100, output_seq_len)
         precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
 
@@ -310,7 +309,7 @@ class TextToSpeech:
         with torch.no_grad():
             return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(torch.tensor([0.0]))
 
-    def tts_with_preset(self, text, speaking_rate=1.0, preset='fast', **kwargs):
+    def tts_with_preset(self, text, preset='fast', **kwargs):
         """
         Calls TTS with one of a set of preset generation parameters. Options:
             'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
@@ -331,9 +330,9 @@ class TextToSpeech:
         }
         settings.update(presets[preset])
         settings.update(kwargs) # allow overriding of preset settings with kwargs
-        return self.tts(text, speaking_rate=speaking_rate,**settings)
+        return self.tts(text, **settings)
 
-    def tts(self, text, speaking_rate=1.0, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
+    def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
             return_deterministic_state=False,
             # autoregressive generation parameters follow
             num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
@@ -498,8 +497,7 @@ class TextToSpeech:
                             latents = latents[:, :k]
                             break
 
-                    mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning,
-                                                speaking_rate=speaking_rate, temperature=diffusion_temperature, 
+                    mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature, 
                                                 verbose=verbose)
                     wav = vocoder.inference(mel)
                     wav_candidates.append(wav.cpu())
diff --git a/tortoise/do_tts.py b/tortoise/do_tts.py
index c47ae63..39acab3 100644
--- a/tortoise/do_tts.py
+++ b/tortoise/do_tts.py
@@ -14,18 +14,19 @@ if __name__ == '__main__':
                                                  'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
     parser.add_argument('--use_deepspeed', type=str, help='Which voice preset to use.', default=False)
+    parser.add_argument('--kv_cache', type=lambda v: v.strip().lower() in ('1', 'true', 't', 'yes', 'y', 'on'), help='If you disable this please wait for a long a time to get the output', default=True)  # not type=bool: bool('False') is True, so the flag could never be switched off
+    parser.add_argument('--half', type=lambda v: v.strip().lower() in ('1', 'true', 't', 'yes', 'y', 'on'), help="float16(half) precision inference if True it's faster and take less vram and ram", default=True)  # not type=bool: any non-empty string would parse as True
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
     parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this'
                                                       'should only be specified if you have custom checkpoints.', default=MODELS_DIR)
     parser.add_argument('--candidates', type=int, help='How many output candidates to produce per-voice.', default=3)
     parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None)
-    parser.add_argument('--speaking_rate', type=float, help='Random seed which can be used to reproduce results.', default=1.0)
     parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)
     parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.'
                                                           'Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)', default=.0)
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
-    tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed)
+    tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
 
     selected_voices = args.voice.split(',')
     for k, selected_voice in enumerate(selected_voices):
@@ -35,7 +36,7 @@ if __name__ == '__main__':
             voice_sel = [selected_voice]
         voice_samples, conditioning_latents = load_voices(voice_sel)
 
-        gen, dbg_state = tts.tts_with_preset(args.text, speaking_rate=args.speaking_rate, k=args.candidates, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
+        gen, dbg_state = tts.tts_with_preset(args.text, k=args.candidates, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                                   preset=args.preset, use_deterministic_seed=args.seed, return_deterministic_state=True, cvvp_amount=args.cvvp_amount)
         if isinstance(gen, list):
             for j, g in enumerate(gen):
diff --git a/tortoise/read.py b/tortoise/read.py
index 29aeabf..38f95ae 100644
--- a/tortoise/read.py
+++ b/tortoise/read.py
@@ -26,7 +26,7 @@ if __name__ == '__main__':
     parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)
     parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
     parser.add_argument('--kv_cache', type=bool, help='If you disable this please wait for a long a time to get the output', default=True)
-    parser.add_argument('--half', type=bool, help='float16(half) precision inference if True it's faster and take less vram and ram', default=True)
+    parser.add_argument('--half', type=lambda v: v.strip().lower() in ('1', 'true', 't', 'yes', 'y', 'on'), help="float16(half) precision inference if True it's faster and take less vram and ram", default=True)  # not type=bool: bool('False') is True, so '--half False' would still enable half precision
 
 
     args = parser.parse_args()