From bf7976172e7ecce1698e1c8c11176b788bb6bcd6 Mon Sep 17 00:00:00 2001
From: Simon Sardorf
Date: Wed, 18 Dec 2024 15:44:50 +0100
Subject: [PATCH 1/2] Enable MPS support

---
 tortoise/api.py      | 29 ++++++++++++++++++-----------
 tortoise/api_fast.py |  8 ++++----
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/tortoise/api.py b/tortoise/api.py
index 8a010c2..0e61ffb 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -243,7 +243,7 @@ class TextToSpeech:
         self.rlg_auto = None
         self.rlg_diffusion = None
     @contextmanager
-    def temporary_cuda(self, model):
+    def temporary_device(self, model):
         m = model.to(self.device)
         yield m
         m = model.cpu()
@@ -410,8 +410,9 @@ class TextToSpeech:
             if verbose:
                 print("Generating autoregressive samples..")
             if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.autoregressive
-                ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
+                with self.temporary_device(self.autoregressive) as autoregressive, torch.autocast(
+                    device_type="cuda", dtype=torch.float16, enabled=self.half
+                ):
                     for b in tqdm(range(num_batches), disable=not verbose):
                         codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                 do_sample=True,
@@ -426,7 +427,9 @@ class TextToSpeech:
                             codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
                         samples.append(codes)
             else:
-                with self.temporary_cuda(self.autoregressive) as autoregressive:
+                with self.temporary_device(self.autoregressive) as autoregressive, torch.autocast(
+                    device_type="mps", dtype=torch.float16, enabled=self.half
+                ):
                     for b in tqdm(range(num_batches), disable=not verbose):
                         codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                 do_sample=True,
@@ -444,8 +447,10 @@ class TextToSpeech:

             clip_results = []
             if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+                with self.temporary_device(self.clvp) as clvp, torch.autocast(
+                    device_type=self.device.type,
+                    dtype=torch.float16,
+                    enabled=self.half
                 ):
                     if cvvp_amount > 0:
                         if self.cvvp is None:
@@ -476,7 +481,7 @@ class TextToSpeech:
                 samples = torch.cat(samples, dim=0)
                 best_results = samples[torch.topk(clip_results, k=k).indices]
             else:
-                with self.temporary_cuda(self.clvp) as clvp:
+                with self.temporary_device(self.clvp) as clvp:
                     if cvvp_amount > 0:
                         if self.cvvp is None:
                             self.load_cvvp()
@@ -513,10 +518,12 @@
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
         if not torch.backends.mps.is_available():
-            with self.temporary_cuda(
+            with self.temporary_device(
                 self.autoregressive
             ) as autoregressive, torch.autocast(
-                device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+                device_type=self.device.type,
+                dtype=torch.float16,
+                enabled=self.half
             ):
                 best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                               torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
@@ -524,7 +531,7 @@ class TextToSpeech:
                                               return_latent=True, clip_inputs=False)
             del auto_conditioning
         else:
-            with self.temporary_cuda(
+            with self.temporary_device(
                 self.autoregressive
             ) as autoregressive:
                 best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
@@ -537,7 +544,7 @@ class TextToSpeech:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
             if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
+                with self.temporary_device(self.diffusion) as diffusion, self.temporary_device(
                     self.vocoder
                 ) as vocoder:
                     for b in range(best_results.shape[0]):
diff --git a/tortoise/api_fast.py b/tortoise/api_fast.py
index fd7c590..130a63b 100644
--- a/tortoise/api_fast.py
+++ b/tortoise/api_fast.py
@@ -371,7 +371,7 @@ class TextToSpeech:
         if verbose:
             print("Generating autoregressive samples..")
         with torch.autocast(
-                device_type="cuda" , dtype=torch.float16, enabled=self.half
+                device_type="cuda" if not torch.backends.mps.is_available() else "mps", dtype=torch.float16, enabled=self.half
             ):
             fake_inputs = self.autoregressive.compute_embeddings(
                 auto_conditioning,
@@ -400,7 +400,7 @@ class TextToSpeech:
         while not is_end:
             try:
                 with torch.autocast(
-                    device_type="cuda", dtype=torch.float16, enabled=self.half
+                    device_type="cuda" if not torch.backends.mps.is_available() else "mps", dtype=torch.float16, enabled=self.half
                 ):
                     codes, latent = next(gpt_generator)
                     all_latents += [latent]
@@ -477,9 +477,9 @@ class TextToSpeech:
         with torch.no_grad():
             calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
-                print("Generating autoregressive samples..")
+                print("Generating autoregressive samples..")
             with torch.autocast(
-                    device_type="cuda" , dtype=torch.float16, enabled=self.half
+                    device_type="cuda" if not torch.backends.mps.is_available() else "mps", dtype=torch.float16, enabled=self.half
                 ):
                 codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                              top_k=50,

From 3d419a1cfb1be8d86c95c261eb7e37bf6543094e Mon Sep 17 00:00:00 2001
From: Simon Sardorf
Date: Wed, 18 Dec 2024 16:05:57 +0100
Subject: [PATCH 2/2] Enable DeepSpeed on Apple Silicon

---
 setup.py               | 7 ++++---
 tortoise/do_tts.py     | 4 +---
 tortoise/read.py       | 2 --
 tortoise/read_fast.py  | 2 --
 tortoise/tts_stream.py | 2 --
 5 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/setup.py b/setup.py
index 807670a..890b58b 100644
--- a/setup.py
+++ b/setup.py
@@ -28,9 +28,10 @@ setuptools.setup(
         'scipy',
         'librosa',
         'transformers==4.31.0',
-        'tokenizers==0.14.0',
-        'scipy==1.13.1'
-        # 'deepspeed==0.8.3',
+        'tokenizers',
+        'scipy==1.13.1',
+        'deepspeed',
+        'py-cpuinfo'
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/tortoise/do_tts.py b/tortoise/do_tts.py
index c6e2b17..00ed210 100644
--- a/tortoise/do_tts.py
+++ b/tortoise/do_tts.py
@@ -13,7 +13,7 @@ if __name__ == '__main__':
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                             'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
-    parser.add_argument('--use_deepspeed', type=str, help='Use deepspeed for speed bump.', default=False)
+    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=False)
     parser.add_argument('--kv_cache', type=bool, help='If you disable this please wait for a long a time to get the output', default=True)
     parser.add_argument('--half', type=bool, help="float16(half) precision inference if True it's faster and take less vram and ram", default=True)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
@@ -25,8 +25,6 @@ if __name__ == '__main__':
     parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.'
                                                           'Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)', default=.0)
     args = parser.parse_args()
-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     os.makedirs(args.output_path, exist_ok=True)

     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
diff --git a/tortoise/read.py b/tortoise/read.py
index e5839aa..ab31bae 100644
--- a/tortoise/read.py
+++ b/tortoise/read.py
@@ -30,8 +30,6 @@ if __name__ == '__main__':

     args = parser.parse_args()

-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)

     outpath = args.output_path
diff --git a/tortoise/read_fast.py b/tortoise/read_fast.py
index f2778d4..8a23e65 100644
--- a/tortoise/read_fast.py
+++ b/tortoise/read_fast.py
@@ -28,8 +28,6 @@ if __name__ == '__main__':

     args = parser.parse_args()

-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)

     outpath = args.output_path
diff --git a/tortoise/tts_stream.py b/tortoise/tts_stream.py
index 94eaff5..f326692 100644
--- a/tortoise/tts_stream.py
+++ b/tortoise/tts_stream.py
@@ -37,8 +37,6 @@ if __name__ == '__main__':

     args = parser.parse_args()

-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)

     outpath = args.output_path
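
Note on the autocast device selection: patch 1 repeats the expression
"cuda" if not torch.backends.mps.is_available() else "mps" at every
torch.autocast call site it touches in api_fast.py. A minimal sketch of the
same selection factored into a single helper; the name autocast_device_type
is illustrative only and does not exist in the patched files:

    import torch

    def autocast_device_type() -> str:
        # Same rule the patch inlines at each call site: use MPS when it is
        # available (Apple Silicon), otherwise fall back to CUDA.
        return "mps" if torch.backends.mps.is_available() else "cuda"

    # Usage mirroring the patched autocast calls:
    # with torch.autocast(device_type=autocast_device_type(),
    #                     dtype=torch.float16, enabled=True):
    #     ...  # model inference here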
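
Note on the --use_deepspeed flag: patch 2 changes its argparse type from str
to bool. argparse applies type to the raw string, and bool("False") is True,
so any explicit value given on the command line (including "False" or "0")
still enables DeepSpeed; only the untouched default of False behaves as
expected. A common workaround is a small parser function; str2bool below is
a hypothetical helper, not part of the patch:

    import argparse

    def str2bool(v: str) -> bool:
        # Accept the usual command-line spellings of a boolean.
        if v.lower() in ("yes", "true", "t", "1"):
            return True
        if v.lower() in ("no", "false", "f", "0"):
            return False
        raise argparse.ArgumentTypeError("boolean value expected")

    # parser.add_argument('--use_deepspeed', type=str2bool, default=False)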