From bf7976172e7ecce1698e1c8c11176b788bb6bcd6 Mon Sep 17 00:00:00 2001
From: Simon Sardorf
Date: Wed, 18 Dec 2024 15:44:50 +0100
Subject: [PATCH 1/2] Enable MPS support

---
 tortoise/api.py      | 29 ++++++++++++++++++-----------
 tortoise/api_fast.py |  8 ++++----
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/tortoise/api.py b/tortoise/api.py
index 8a010c2..0e61ffb 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -243,7 +243,7 @@ class TextToSpeech:
         self.rlg_auto = None
         self.rlg_diffusion = None
     @contextmanager
-    def temporary_cuda(self, model):
+    def temporary_device(self, model):
         m = model.to(self.device)
         yield m
         m = model.cpu()
@@ -410,8 +410,9 @@ class TextToSpeech:
             if verbose:
                 print("Generating autoregressive samples..")
             if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.autoregressive
-                ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
+                with self.temporary_device(self.autoregressive) as autoregressive, torch.autocast(
+                    device_type="cuda", dtype=torch.float16, enabled=self.half
+                ):
                     for b in tqdm(range(num_batches), disable=not verbose):
                         codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                 do_sample=True,
@@ -426,7 +427,9 @@ class TextToSpeech:
                             codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
                         samples.append(codes)
             else:
-                with self.temporary_cuda(self.autoregressive) as autoregressive:
+                with self.temporary_device(self.autoregressive) as autoregressive, torch.autocast(
+                    device_type="mps", dtype=torch.float16, enabled=self.half
+                ):
                     for b in tqdm(range(num_batches), disable=not verbose):
                         codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                 do_sample=True,
@@ -444,8 +447,10 @@ class TextToSpeech:

             clip_results = []
             if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+                with self.temporary_device(self.clvp) as clvp, torch.autocast(
+                    device_type=self.device.type,
+                    dtype=torch.float16,
+                    enabled=self.half
                 ):
                     if cvvp_amount > 0:
                         if self.cvvp is None:
@@ -476,7 +481,7 @@ class TextToSpeech:
                 samples = torch.cat(samples, dim=0)
                 best_results = samples[torch.topk(clip_results, k=k).indices]
             else:
-                with self.temporary_cuda(self.clvp) as clvp:
+                with self.temporary_device(self.clvp) as clvp:
                     if cvvp_amount > 0:
                         if self.cvvp is None:
                             self.load_cvvp()
@@ -513,10 +518,12 @@
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
         if not torch.backends.mps.is_available():
-            with self.temporary_cuda(
+            with self.temporary_device(
                 self.autoregressive
             ) as autoregressive, torch.autocast(
-                device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+                device_type=self.device.type,
+                dtype=torch.float16,
+                enabled=self.half
             ):
                 best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                               torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
@@ -524,7 +531,7 @@ class TextToSpeech:
                                               return_latent=True, clip_inputs=False)
             del auto_conditioning
         else:
-            with self.temporary_cuda(
+            with self.temporary_device(
                 self.autoregressive
             ) as autoregressive:
                 best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
@@ -537,7 +544,7 @@ class TextToSpeech:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
             if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
+                with self.temporary_device(self.diffusion) as diffusion, self.temporary_device(
                     self.vocoder
                 ) as vocoder:
                     for b in range(best_results.shape[0]):
diff --git a/tortoise/api_fast.py b/tortoise/api_fast.py
index fd7c590..130a63b 100644
--- a/tortoise/api_fast.py
+++ b/tortoise/api_fast.py
@@ -371,7 +371,7 @@ class TextToSpeech:
         if verbose:
             print("Generating autoregressive samples..")
         with torch.autocast(
-                device_type="cuda" , dtype=torch.float16, enabled=self.half
+                device_type="cuda" if not torch.backends.mps.is_available() else "mps", dtype=torch.float16, enabled=self.half
             ):
             fake_inputs = self.autoregressive.compute_embeddings(
                 auto_conditioning,
@@ -400,7 +400,7 @@ class TextToSpeech:
         while not is_end:
             try:
                 with torch.autocast(
-                    device_type="cuda", dtype=torch.float16, enabled=self.half
+                    device_type="cuda" if not torch.backends.mps.is_available() else "mps", dtype=torch.float16, enabled=self.half
                 ):
                     codes, latent = next(gpt_generator)
                     all_latents += [latent]
@@ -477,9 +477,9 @@ class TextToSpeech:
         with torch.no_grad():
             calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
-                print("Generating autoregressive samples..")
+                print("Generating autoregressive samples..")
             with torch.autocast(
-                    device_type="cuda" , dtype=torch.float16, enabled=self.half
+                    device_type="cuda" if not torch.backends.mps.is_available() else "mps", dtype=torch.float16, enabled=self.half
                 ):
                 codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                              top_k=50,

From 3d419a1cfb1be8d86c95c261eb7e37bf6543094e Mon Sep 17 00:00:00 2001
From: Simon Sardorf
Date: Wed, 18 Dec 2024 16:05:57 +0100
Subject: [PATCH 2/2] Enable DeepSpeed on Apple Silicon

---
 setup.py               | 7 ++++---
 tortoise/do_tts.py     | 4 +---
 tortoise/read.py       | 2 --
 tortoise/read_fast.py  | 2 --
 tortoise/tts_stream.py | 2 --
 5 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/setup.py b/setup.py
index 807670a..890b58b 100644
--- a/setup.py
+++ b/setup.py
@@ -28,9 +28,10 @@ setuptools.setup(
         'scipy',
         'librosa',
         'transformers==4.31.0',
-        'tokenizers==0.14.0',
-        'scipy==1.13.1'
-        # 'deepspeed==0.8.3',
+        'tokenizers',
+        'scipy==1.13.1',
+        'deepspeed',
+        'py-cpuinfo'
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/tortoise/do_tts.py b/tortoise/do_tts.py
index c6e2b17..00ed210 100644
--- a/tortoise/do_tts.py
+++ b/tortoise/do_tts.py
@@ -13,7 +13,7 @@ if __name__ == '__main__':
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                             'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
-    parser.add_argument('--use_deepspeed', type=str, help='Use deepspeed for speed bump.', default=False)
+    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=False)
     parser.add_argument('--kv_cache', type=bool, help='If you disable this please wait for a long a time to get the output', default=True)
     parser.add_argument('--half', type=bool, help="float16(half) precision inference if True it's faster and take less vram and ram", default=True)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
@@ -25,8 +25,6 @@ if __name__ == '__main__':
     parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.'
                                                           'Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)', default=.0)
     args = parser.parse_args()
-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     os.makedirs(args.output_path, exist_ok=True)

     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
diff --git a/tortoise/read.py b/tortoise/read.py
index e5839aa..ab31bae 100644
--- a/tortoise/read.py
+++ b/tortoise/read.py
@@ -30,8 +30,6 @@ if __name__ == '__main__':

     args = parser.parse_args()

-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)

     outpath = args.output_path
diff --git a/tortoise/read_fast.py b/tortoise/read_fast.py
index f2778d4..8a23e65 100644
--- a/tortoise/read_fast.py
+++ b/tortoise/read_fast.py
@@ -28,8 +28,6 @@ if __name__ == '__main__':

     args = parser.parse_args()

-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)

     outpath = args.output_path
diff --git a/tortoise/tts_stream.py b/tortoise/tts_stream.py
index 94eaff5..f326692 100644
--- a/tortoise/tts_stream.py
+++ b/tortoise/tts_stream.py
@@ -37,8 +37,6 @@ if __name__ == '__main__':

     args = parser.parse_args()

-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)

     outpath = args.output_path
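
Note on the autocast device selection: patch 1 repeats the expression
"cuda" if not torch.backends.mps.is_available() else "mps" at every
torch.autocast call site it touches in api_fast.py. A minimal sketch of the
same selection factored into a single helper; the name autocast_device_type
is illustrative only and does not exist in the patched files:

    import torch

    def autocast_device_type() -> str:
        # Same rule the patch inlines at each call site: use MPS when it is
        # available (Apple Silicon), otherwise fall back to CUDA.
        return "mps" if torch.backends.mps.is_available() else "cuda"

    # Usage mirroring the patched autocast calls:
    # with torch.autocast(device_type=autocast_device_type(),
    #                     dtype=torch.float16, enabled=True):
    #     ...  # model inference here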
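
Note on the --use_deepspeed flag: patch 2 changes its argparse type from str
to bool. argparse applies type to the raw string, and bool("False") is True,
so any explicit value given on the command line (including "False" or "0")
still enables DeepSpeed; only the untouched default of False behaves as
expected. A common workaround is a small parser function; str2bool below is
a hypothetical helper, not part of the patch:

    import argparse

    def str2bool(v: str) -> bool:
        # Accept the usual command-line spellings of a boolean.
        if v.lower() in ("yes", "true", "t", "1"):
            return True
        if v.lower() in ("no", "false", "f", "0"):
            return False
        raise argparse.ArgumentTypeError("boolean value expected")

    # parser.add_argument('--use_deepspeed', type=str2bool, default=False)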