From b4988c24b3cb0f5490b0c92dbebc77c1e439f1cd Mon Sep 17 00:00:00 2001
From: Jerry-Master <joseperez2000@hotmail.es>
Date: Sun, 6 Aug 2023 17:41:30 +0200
Subject: [PATCH 1/6] Added MPS support for do_tts

---
 requirements.txt                     |   1 +
 tortoise/api.py                      | 216 +++++++++++++++++++--------
 tortoise/models/autoregressive.py    |   9 +-
 tortoise/models/diffusion_decoder.py |   5 +-
 tortoise/utils/audio.py              |   2 +-
 tortoise/utils/diffusion.py          |   2 +-
 tortoise/utils/wav2vec_alignment.py  |   2 +-
 7 files changed, 169 insertions(+), 68 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5168c32..48df0b6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,4 @@ pydantic==1.9.1
 deepspeed==0.8.3
 py-cpuinfo
 hjson
+psutil
diff --git a/tortoise/api.py b/tortoise/api.py
index efa01fb..a095a88 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -189,6 +189,16 @@ def pick_best_batch_size_for_gpu():
             return 8
         elif availableGb > 7:
             return 4
+    if torch.backends.mps.is_available():
+        import psutil
+        available = psutil.virtual_memory().total
+        availableGb = available / (1024 ** 3)
+        if availableGb > 14:
+            return 16
+        elif availableGb > 10:
+            return 8
+        elif availableGb > 7:
+            return 4
     return 1
 
 class TextToSpeech:
@@ -212,7 +222,9 @@ class TextToSpeech:
         self.models_dir = models_dir
         self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None else autoregressive_batch_size
         self.enable_redaction = enable_redaction
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.device = torch.device('cuda' if torch.cuda.is_available() else'cpu')
+        if torch.backends.mps.is_available():
+            self.device = torch.device('mps')
         if self.enable_redaction:
             self.aligner = Wav2VecAlignment()
 
@@ -254,6 +266,7 @@ class TextToSpeech:
         m = model.to(self.device)
         yield m
         m = model.cpu()
+
     
     def load_cvvp(self):
         """Load CVVP model."""
@@ -410,54 +423,102 @@ class TextToSpeech:
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
                 print("Generating autoregressive samples..")
-            with self.temporary_cuda(self.autoregressive
-            ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
-                for b in tqdm(range(num_batches), disable=not verbose):
-                    codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                                do_sample=True,
-                                                                top_p=top_p,
-                                                                temperature=temperature,
-                                                                num_return_sequences=self.autoregressive_batch_size,
-                                                                length_penalty=length_penalty,
-                                                                repetition_penalty=repetition_penalty,
-                                                                max_generate_length=max_mel_tokens,
-                                                                **hf_generate_kwargs)
-                    padding_needed = max_mel_tokens - codes.shape[1]
-                    codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
-                    samples.append(codes)
+            if not torch.backends.mps.is_available():
+                with self.temporary_cuda(self.autoregressive
+                ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
+                    for b in tqdm(range(num_batches), disable=not verbose):
+                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                                    do_sample=True,
+                                                                    top_p=top_p,
+                                                                    temperature=temperature,
+                                                                    num_return_sequences=self.autoregressive_batch_size,
+                                                                    length_penalty=length_penalty,
+                                                                    repetition_penalty=repetition_penalty,
+                                                                    max_generate_length=max_mel_tokens,
+                                                                    **hf_generate_kwargs)
+                        padding_needed = max_mel_tokens - codes.shape[1]
+                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
+                        samples.append(codes)
+            else:
+                with self.temporary_cuda(self.autoregressive) as autoregressive:
+                    for b in tqdm(range(num_batches), disable=not verbose):
+                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                                    do_sample=True,
+                                                                    top_p=top_p,
+                                                                    temperature=temperature,
+                                                                    num_return_sequences=self.autoregressive_batch_size,
+                                                                    length_penalty=length_penalty,
+                                                                    repetition_penalty=repetition_penalty,
+                                                                    max_generate_length=max_mel_tokens,
+                                                                    **hf_generate_kwargs)
+                        padding_needed = max_mel_tokens - codes.shape[1]
+                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
+                        samples.append(codes)
 
             clip_results = []
-            with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                device_type="cuda", dtype=torch.float16, enabled=self.half
-            ):
-                if cvvp_amount > 0:
-                    if self.cvvp is None:
-                        self.load_cvvp()
-                    self.cvvp = self.cvvp.to(self.device)
-                if verbose:
-                    if self.cvvp is None:
-                        print("Computing best candidates using CLVP")
-                    else:
-                        print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
-                for batch in tqdm(samples, disable=not verbose):
-                    for i in range(batch.shape[0]):
-                        batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
-                    if cvvp_amount != 1:
-                        clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
-                    if auto_conds is not None and cvvp_amount > 0:
-                        cvvp_accumulator = 0
-                        for cl in range(auto_conds.shape[1]):
-                            cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
-                        cvvp = cvvp_accumulator / auto_conds.shape[1]
-                        if cvvp_amount == 1:
-                            clip_results.append(cvvp)
+            
+            if not torch.backends.mps.is_available():
+                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
+                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+                ):
+                    if cvvp_amount > 0:
+                        if self.cvvp is None:
+                            self.load_cvvp()
+                        self.cvvp = self.cvvp.to(self.device)
+                    if verbose:
+                        if self.cvvp is None:
+                            print("Computing best candidates using CLVP")
                         else:
-                            clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
-                    else:
-                        clip_results.append(clvp_out)
-                clip_results = torch.cat(clip_results, dim=0)
-                samples = torch.cat(samples, dim=0)
-                best_results = samples[torch.topk(clip_results, k=k).indices]
+                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
+                    for batch in tqdm(samples, disable=not verbose):
+                        for i in range(batch.shape[0]):
+                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
+                        if cvvp_amount != 1:
+                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
+                        if auto_conds is not None and cvvp_amount > 0:
+                            cvvp_accumulator = 0
+                            for cl in range(auto_conds.shape[1]):
+                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                            cvvp = cvvp_accumulator / auto_conds.shape[1]
+                            if cvvp_amount == 1:
+                                clip_results.append(cvvp)
+                            else:
+                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
+                        else:
+                            clip_results.append(clvp_out)
+                    clip_results = torch.cat(clip_results, dim=0)
+                    samples = torch.cat(samples, dim=0)
+                    best_results = samples[torch.topk(clip_results, k=k).indices]
+            else:
+                with self.temporary_cuda(self.clvp) as clvp:
+                    if cvvp_amount > 0:
+                        if self.cvvp is None:
+                            self.load_cvvp()
+                        self.cvvp = self.cvvp.to(self.device)
+                    if verbose:
+                        if self.cvvp is None:
+                            print("Computing best candidates using CLVP")
+                        else:
+                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
+                    for batch in tqdm(samples, disable=not verbose):
+                        for i in range(batch.shape[0]):
+                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
+                        if cvvp_amount != 1:
+                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
+                        if auto_conds is not None and cvvp_amount > 0:
+                            cvvp_accumulator = 0
+                            for cl in range(auto_conds.shape[1]):
+                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                            cvvp = cvvp_accumulator / auto_conds.shape[1]
+                            if cvvp_amount == 1:
+                                clip_results.append(cvvp)
+                            else:
+                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
+                        else:
+                            clip_results.append(clvp_out)
+                    clip_results = torch.cat(clip_results, dim=0)
+                    samples = torch.cat(samples, dim=0)
+                    best_results = samples[torch.topk(clip_results, k=k).indices]
             if self.cvvp is not None:
                 self.cvvp = self.cvvp.cpu()
             del samples
@@ -465,26 +526,58 @@ class TextToSpeech:
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            with self.temporary_cuda(
-                self.autoregressive
-            ) as autoregressive, torch.autocast(
-                device_type="cuda", dtype=torch.float16, enabled=self.half
-            ):
-                best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                                torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
-                                                torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                                return_latent=True, clip_inputs=False)
-                del auto_conditioning
+            if not torch.backends.mps.is_available():
+                with self.temporary_cuda(
+                    self.autoregressive
+                ) as autoregressive, torch.autocast(
+                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+                ):
+                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                                    torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
+                                                    torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                                    return_latent=True, clip_inputs=False)
+                    del auto_conditioning
+            else:
+                with self.temporary_cuda(
+                    self.autoregressive
+                ) as autoregressive:
+                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                                    torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
+                                                    torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                                    return_latent=True, clip_inputs=False)
+                    del auto_conditioning
 
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
-                self.vocoder
-            ) as vocoder:
+            if not torch.backends.mps.is_available():
+                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
+                    self.vocoder
+                ) as vocoder:
+                    for b in range(best_results.shape[0]):
+                        codes = best_results[b].unsqueeze(0)
+                        latents = best_latents[b].unsqueeze(0)
+
+                        # Find the first occurrence of the "calm" token and trim the codes to that.
+                        ctokens = 0
+                        for k in range(codes.shape[-1]):
+                            if codes[0, k] == calm_token:
+                                ctokens += 1
+                            else:
+                                ctokens = 0
+                            if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                                latents = latents[:, :k]
+                                break
+                        mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature, 
+                                                    verbose=verbose)
+                        wav = vocoder.inference(mel)
+                        wav_candidates.append(wav.cpu())
+            else:
+                diffusion, vocoder = self.diffusion, self.vocoder
+                diffusion_conditioning = diffusion_conditioning.cpu()
                 for b in range(best_results.shape[0]):
-                    codes = best_results[b].unsqueeze(0)
-                    latents = best_latents[b].unsqueeze(0)
+                    codes = best_results[b].unsqueeze(0).cpu()
+                    latents = best_latents[b].unsqueeze(0).cpu()
 
                     # Find the first occurrence of the "calm" token and trim the codes to that.
                     ctokens = 0
@@ -496,7 +589,6 @@ class TextToSpeech:
                         if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
                             latents = latents[:, :k]
                             break
-
                     mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature, 
                                                 verbose=verbose)
                     wav = vocoder.inference(mel)
diff --git a/tortoise/models/autoregressive.py b/tortoise/models/autoregressive.py
index 9a6eec9..2d01066 100644
--- a/tortoise/models/autoregressive.py
+++ b/tortoise/models/autoregressive.py
@@ -47,7 +47,7 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
         self.cached_mel_emb = None
     def parallelize(self, device_map=None):
         self.device_map = (
-            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
+            get_device_map(len(self.transformer.h), range(max(1, torch.cuda.device_count())))
             if device_map is None
             else device_map
         )
@@ -62,6 +62,8 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
         self.lm_head = self.lm_head.to("cpu")
         self.model_parallel = False
         torch.cuda.empty_cache()
+        if torch.backends.mps.is_available():
+            torch.mps.empty_cache()
     
     def get_output_embeddings(self):
         return self.lm_head
@@ -162,7 +164,10 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
 
         # Set device for model parallelism
         if self.model_parallel:
-            torch.cuda.set_device(self.transformer.first_device)
+            if torch.backends.mps.is_available():
+                self.to(self.transformer.first_device)
+            else:
+                torch.cuda.set_device(self.transformer.first_device)
             hidden_states = hidden_states.to(self.lm_head.weight.device)
 
         lm_logits = self.lm_head(hidden_states)
diff --git a/tortoise/models/diffusion_decoder.py b/tortoise/models/diffusion_decoder.py
index f67d21a..e969129 100644
--- a/tortoise/models/diffusion_decoder.py
+++ b/tortoise/models/diffusion_decoder.py
@@ -302,7 +302,10 @@ class DiffusionTts(nn.Module):
                 unused_params.extend(list(lyr.parameters()))
             else:
                 # First and last blocks will have autocast disabled for improved precision.
-                with autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
+                if not torch.backends.mps.is_available():
+                    with autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
+                        x = lyr(x, time_emb)
+                else:
                     x = lyr(x, time_emb)
 
         x = x.float()
diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
index 91237dd..6842af5 100644
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -180,7 +180,7 @@ class TacotronSTFT(torch.nn.Module):
         return mel_output
 
 
-def wav_to_univnet_mel(wav, do_normalization=False, device='cuda'):
+def wav_to_univnet_mel(wav, do_normalization=False, device='cuda' if not torch.backends.mps.is_available() else 'mps'):
     stft = TacotronSTFT(1024, 256, 1024, 100, 24000, 0, 12000)
     stft = stft.to(device)
     mel = stft.mel_spectrogram(wav)
diff --git a/tortoise/utils/diffusion.py b/tortoise/utils/diffusion.py
index e877ff2..6d4d594 100644
--- a/tortoise/utils/diffusion.py
+++ b/tortoise/utils/diffusion.py
@@ -1244,7 +1244,7 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
                             dimension equal to the length of timesteps.
     :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
     """
-    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    res = th.from_numpy(arr.astype(np.float32)).to(device=timesteps.device)[timesteps]
     while len(res.shape) < len(broadcast_shape):
         res = res[..., None]
     return res.expand(broadcast_shape)
\ No newline at end of file
diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py
index bbe3285..adc39e3 100644
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@@ -49,7 +49,7 @@ class Wav2VecAlignment:
     """
     Uses wav2vec2 to perform audio<->text alignment.
     """
-    def __init__(self, device='cuda'):
+    def __init__(self, device='cuda' if not torch.backends.mps.is_available() else 'mps'):
         self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu()
         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h")
         self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('jbetker/tacotron-symbols')

From 8d67995ba7c27c52705479596a5a6fe6cb834c9e Mon Sep 17 00:00:00 2001
From: Jerry-Master <joseperez2000@hotmail.es>
Date: Sun, 6 Aug 2023 19:01:10 +0200
Subject: [PATCH 2/6] Added MPS support

---
 tortoise/api.py              | 2 +-
 tortoise/models/arch_util.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tortoise/api.py b/tortoise/api.py
index a095a88..50ad6e2 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -100,7 +100,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
 
-def format_conditioning(clip, cond_length=132300, device='cuda'):
+def format_conditioning(clip, cond_length=132300, device="cuda" if not torch.backends.mps.is_available() else 'mps'):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
     """
diff --git a/tortoise/models/arch_util.py b/tortoise/models/arch_util.py
index 661ee1f..f678a02 100644
--- a/tortoise/models/arch_util.py
+++ b/tortoise/models/arch_util.py
@@ -319,6 +319,8 @@ class TorchMelSpectrogram(nn.Module):
         if len(inp.shape) == 3:  # Automatically squeeze out the channels dimension if it is present (assuming mono-audio)
             inp = inp.squeeze(1)
         assert len(inp.shape) == 2
+        if torch.backends.mps.is_available():
+            inp = inp.to('cpu')
         self.mel_stft = self.mel_stft.to(inp.device)
         mel = self.mel_stft(inp)
         # Perform dynamic range compression

From dee73cd703f830b2dbca55324db4265272ae4002 Mon Sep 17 00:00:00 2001
From: Jose <34888496+Jerry-Master@users.noreply.github.com>
Date: Fri, 11 Aug 2023 14:42:45 +0200
Subject: [PATCH 3/6] Updated README.md

---
 README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/README.md b/README.md
index dc311cb..989cd1f 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,27 @@ Optionally, pytorch can be installed in the base environment, so that other cond
 
 If you are on windows, you may also need to install pysoundfile: `conda install -c conda-forge pysoundfile`
 
+## Apple Silicon
+
+On macOS 13+ with M1/M2 chips you need to install the nightly version of PyTorch. As stated on the official page, you can do:
+
+```shell
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+```
+
+Be sure to do that after you activate the environment. If you don't use conda the commands would look like this:
+
+```shell
+python3.10 -m venv .venv
+source .venv/bin/activate
+pip install numba inflect
+pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+pip install transformers
+git clone https://github.com/neonbjb/tortoise-tts.git
+cd tortoise-tts
+pip install .
+```
+
 ### do_tts.py
 
 This script allows you to speak a single phrase with one or more voices.

From bbe3c1550e9b725715947bdd39c3fccdd6371117 Mon Sep 17 00:00:00 2001
From: Jose <34888496+Jerry-Master@users.noreply.github.com>
Date: Fri, 11 Aug 2023 16:34:52 +0200
Subject: [PATCH 4/6] Updated for read.py

---
 README.md        | 3 +++
 tortoise/read.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 989cd1f..f6b1725 100644
--- a/README.md
+++ b/README.md
@@ -108,6 +108,9 @@ cd tortoise-tts
 pip install .
 ```
 
+Be aware that the code does not work with DeepSpeed, try to avoid using the flag `--use_deepspeed` in the commands below.
+Also, mixed precision is not supported on MPS so don't use the `--half` flag either.
+
 ### do_tts.py
 
 This script allows you to speak a single phrase with one or more voices.
diff --git a/tortoise/read.py b/tortoise/read.py
index 38f95ae..ab31bae 100644
--- a/tortoise/read.py
+++ b/tortoise/read.py
@@ -24,7 +24,7 @@ if __name__ == '__main__':
                                                       'should only be specified if you have custom checkpoints.', default=MODELS_DIR)
     parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None)
     parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.', default=True)
-    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True)
+    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=False)
     parser.add_argument('--kv_cache', type=bool, help='If you disable this please wait for a long a time to get the output', default=True)
     parser.add_argument('--half', type=bool, help="float16(half) precision inference if True it's faster and take less vram and ram", default=True)
 

From bde2cc2bff2ddc3a2ead0e5ec0d078d16749da6d Mon Sep 17 00:00:00 2001
From: Jose <34888496+Jerry-Master@users.noreply.github.com>
Date: Fri, 11 Aug 2023 16:39:09 +0200
Subject: [PATCH 5/6] Removed deepspeed for MPS

---
 README.md          | 4 ++--
 tortoise/do_tts.py | 2 ++
 tortoise/read.py   | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f6b1725..8da668d 100644
--- a/README.md
+++ b/README.md
@@ -108,8 +108,8 @@ cd tortoise-tts
 pip install .
 ```
 
-Be aware that the code does not work with DeepSpeed, try to avoid using the flag `--use_deepspeed` in the commands below.
-Also, mixed precision is not supported on MPS so don't use the `--half` flag either.
+Be aware that DeepSpeed is disabled on Apple Silicon since it does not work. The flag `--use_deepspeed` is ignored.
+
 
 ### do_tts.py
 
diff --git a/tortoise/do_tts.py b/tortoise/do_tts.py
index 39acab3..2f0e562 100644
--- a/tortoise/do_tts.py
+++ b/tortoise/do_tts.py
@@ -25,6 +25,8 @@ if __name__ == '__main__':
     parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.'
                                                           'Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)', default=.0)
     args = parser.parse_args()
+    if torch.backends.mps.is_available():
+        args.use_deepspeed = False
     os.makedirs(args.output_path, exist_ok=True)
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
 
diff --git a/tortoise/read.py b/tortoise/read.py
index ab31bae..e5839aa 100644
--- a/tortoise/read.py
+++ b/tortoise/read.py
@@ -30,6 +30,8 @@ if __name__ == '__main__':
 
 
     args = parser.parse_args()
+    if torch.backends.mps.is_available():
+        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
 
     outpath = args.output_path

From 791220fd04dbacaa493f7dcadfd7cd6271383c3e Mon Sep 17 00:00:00 2001
From: Jose <34888496+Jerry-Master@users.noreply.github.com>
Date: Fri, 11 Aug 2023 16:44:22 +0200
Subject: [PATCH 6/6] Added extra clarification.

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 8da668d..1390f58 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,7 @@ pip install .
 ```
 
 Be aware that DeepSpeed is disabled on Apple Silicon since it does not work. The flag `--use_deepspeed` is ignored.
+You may need to prepend `PYTORCH_ENABLE_MPS_FALLBACK=1` to the commands below to make them work, since MPS does not support all the operations in PyTorch.
 
 
 ### do_tts.py