From b4988c24b3cb0f5490b0c92dbebc77c1e439f1cd Mon Sep 17 00:00:00 2001 From: Jerry-Master Date: Sun, 6 Aug 2023 17:41:30 +0200 Subject: [PATCH 1/6] Added MPS support for do_tts --- requirements.txt | 1 + tortoise/api.py | 216 +++++++++++++++++++-------- tortoise/models/autoregressive.py | 9 +- tortoise/models/diffusion_decoder.py | 5 +- tortoise/utils/audio.py | 2 +- tortoise/utils/diffusion.py | 2 +- tortoise/utils/wav2vec_alignment.py | 2 +- 7 files changed, 169 insertions(+), 68 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5168c32..48df0b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,3 +21,4 @@ pydantic==1.9.1 deepspeed==0.8.3 py-cpuinfo hjson +psutil diff --git a/tortoise/api.py b/tortoise/api.py index efa01fb..a095a88 100644 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -189,6 +189,16 @@ def pick_best_batch_size_for_gpu(): return 8 elif availableGb > 7: return 4 + if torch.backends.mps.is_available(): + import psutil + available = psutil.virtual_memory().total + availableGb = available / (1024 ** 3) + if availableGb > 14: + return 16 + elif availableGb > 10: + return 8 + elif availableGb > 7: + return 4 return 1 class TextToSpeech: @@ -212,7 +222,9 @@ class TextToSpeech: self.models_dir = models_dir self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None else autoregressive_batch_size self.enable_redaction = enable_redaction - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = torch.device('cuda' if torch.cuda.is_available() else'cpu') + if torch.backends.mps.is_available(): + self.device = torch.device('mps') if self.enable_redaction: self.aligner = Wav2VecAlignment() @@ -254,6 +266,7 @@ class TextToSpeech: m = model.to(self.device) yield m m = model.cpu() + def load_cvvp(self): """Load CVVP model.""" @@ -410,54 +423,102 @@ class TextToSpeech: calm_token = 83 # This is the token for coding silence, which is fixed in place with 
"fix_autoregressive_output" if verbose: print("Generating autoregressive samples..") - with self.temporary_cuda(self.autoregressive - ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half): - for b in tqdm(range(num_batches), disable=not verbose): - codes = autoregressive.inference_speech(auto_conditioning, text_tokens, - do_sample=True, - top_p=top_p, - temperature=temperature, - num_return_sequences=self.autoregressive_batch_size, - length_penalty=length_penalty, - repetition_penalty=repetition_penalty, - max_generate_length=max_mel_tokens, - **hf_generate_kwargs) - padding_needed = max_mel_tokens - codes.shape[1] - codes = F.pad(codes, (0, padding_needed), value=stop_mel_token) - samples.append(codes) + if not torch.backends.mps.is_available(): + with self.temporary_cuda(self.autoregressive + ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half): + for b in tqdm(range(num_batches), disable=not verbose): + codes = autoregressive.inference_speech(auto_conditioning, text_tokens, + do_sample=True, + top_p=top_p, + temperature=temperature, + num_return_sequences=self.autoregressive_batch_size, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + max_generate_length=max_mel_tokens, + **hf_generate_kwargs) + padding_needed = max_mel_tokens - codes.shape[1] + codes = F.pad(codes, (0, padding_needed), value=stop_mel_token) + samples.append(codes) + else: + with self.temporary_cuda(self.autoregressive) as autoregressive: + for b in tqdm(range(num_batches), disable=not verbose): + codes = autoregressive.inference_speech(auto_conditioning, text_tokens, + do_sample=True, + top_p=top_p, + temperature=temperature, + num_return_sequences=self.autoregressive_batch_size, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + max_generate_length=max_mel_tokens, + **hf_generate_kwargs) + padding_needed = max_mel_tokens - codes.shape[1] + codes = 
F.pad(codes, (0, padding_needed), value=stop_mel_token) + samples.append(codes) clip_results = [] - with self.temporary_cuda(self.clvp) as clvp, torch.autocast( - device_type="cuda", dtype=torch.float16, enabled=self.half - ): - if cvvp_amount > 0: - if self.cvvp is None: - self.load_cvvp() - self.cvvp = self.cvvp.to(self.device) - if verbose: - if self.cvvp is None: - print("Computing best candidates using CLVP") - else: - print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%") - for batch in tqdm(samples, disable=not verbose): - for i in range(batch.shape[0]): - batch[i] = fix_autoregressive_output(batch[i], stop_mel_token) - if cvvp_amount != 1: - clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False) - if auto_conds is not None and cvvp_amount > 0: - cvvp_accumulator = 0 - for cl in range(auto_conds.shape[1]): - cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False) - cvvp = cvvp_accumulator / auto_conds.shape[1] - if cvvp_amount == 1: - clip_results.append(cvvp) + + if not torch.backends.mps.is_available(): + with self.temporary_cuda(self.clvp) as clvp, torch.autocast( + device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half + ): + if cvvp_amount > 0: + if self.cvvp is None: + self.load_cvvp() + self.cvvp = self.cvvp.to(self.device) + if verbose: + if self.cvvp is None: + print("Computing best candidates using CLVP") else: - clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount)) - else: - clip_results.append(clvp_out) - clip_results = torch.cat(clip_results, dim=0) - samples = torch.cat(samples, dim=0) - best_results = samples[torch.topk(clip_results, k=k).indices] + print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%") + for batch in tqdm(samples, disable=not verbose): + for i in 
range(batch.shape[0]): + batch[i] = fix_autoregressive_output(batch[i], stop_mel_token) + if cvvp_amount != 1: + clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False) + if auto_conds is not None and cvvp_amount > 0: + cvvp_accumulator = 0 + for cl in range(auto_conds.shape[1]): + cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False) + cvvp = cvvp_accumulator / auto_conds.shape[1] + if cvvp_amount == 1: + clip_results.append(cvvp) + else: + clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount)) + else: + clip_results.append(clvp_out) + clip_results = torch.cat(clip_results, dim=0) + samples = torch.cat(samples, dim=0) + best_results = samples[torch.topk(clip_results, k=k).indices] + else: + with self.temporary_cuda(self.clvp) as clvp: + if cvvp_amount > 0: + if self.cvvp is None: + self.load_cvvp() + self.cvvp = self.cvvp.to(self.device) + if verbose: + if self.cvvp is None: + print("Computing best candidates using CLVP") + else: + print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%") + for batch in tqdm(samples, disable=not verbose): + for i in range(batch.shape[0]): + batch[i] = fix_autoregressive_output(batch[i], stop_mel_token) + if cvvp_amount != 1: + clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False) + if auto_conds is not None and cvvp_amount > 0: + cvvp_accumulator = 0 + for cl in range(auto_conds.shape[1]): + cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False) + cvvp = cvvp_accumulator / auto_conds.shape[1] + if cvvp_amount == 1: + clip_results.append(cvvp) + else: + clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount)) + else: + clip_results.append(clvp_out) + clip_results = torch.cat(clip_results, dim=0) + samples = torch.cat(samples, dim=0) + best_results = 
samples[torch.topk(clip_results, k=k).indices] if self.cvvp is not None: self.cvvp = self.cvvp.cpu() del samples @@ -465,26 +526,58 @@ class TextToSpeech: # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these # results, but will increase memory usage. - with self.temporary_cuda( - self.autoregressive - ) as autoregressive, torch.autocast( - device_type="cuda", dtype=torch.float16, enabled=self.half - ): - best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1), - torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results, - torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device), - return_latent=True, clip_inputs=False) - del auto_conditioning + if not torch.backends.mps.is_available(): + with self.temporary_cuda( + self.autoregressive + ) as autoregressive, torch.autocast( + device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half + ): + best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1), + torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results, + torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device), + return_latent=True, clip_inputs=False) + del auto_conditioning + else: + with self.temporary_cuda( + self.autoregressive + ) as autoregressive: + best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1), + torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results, + torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device), + return_latent=True, clip_inputs=False) + del auto_conditioning if verbose: print("Transforming autoregressive outputs 
into audio..") wav_candidates = [] - with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda( - self.vocoder - ) as vocoder: + if not torch.backends.mps.is_available(): + with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda( + self.vocoder + ) as vocoder: + for b in range(best_results.shape[0]): + codes = best_results[b].unsqueeze(0) + latents = best_latents[b].unsqueeze(0) + + # Find the first occurrence of the "calm" token and trim the codes to that. + ctokens = 0 + for k in range(codes.shape[-1]): + if codes[0, k] == calm_token: + ctokens += 1 + else: + ctokens = 0 + if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech. + latents = latents[:, :k] + break + mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature, + verbose=verbose) + wav = vocoder.inference(mel) + wav_candidates.append(wav.cpu()) + else: + diffusion, vocoder = self.diffusion, self.vocoder + diffusion_conditioning = diffusion_conditioning.cpu() for b in range(best_results.shape[0]): - codes = best_results[b].unsqueeze(0) - latents = best_latents[b].unsqueeze(0) + codes = best_results[b].unsqueeze(0).cpu() + latents = best_latents[b].unsqueeze(0).cpu() # Find the first occurrence of the "calm" token and trim the codes to that. ctokens = 0 @@ -496,7 +589,6 @@ class TextToSpeech: if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech. 
latents = latents[:, :k] break - mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature, verbose=verbose) wav = vocoder.inference(mel) diff --git a/tortoise/models/autoregressive.py b/tortoise/models/autoregressive.py index 9a6eec9..2d01066 100644 --- a/tortoise/models/autoregressive.py +++ b/tortoise/models/autoregressive.py @@ -47,7 +47,7 @@ class GPT2InferenceModel(GPT2PreTrainedModel): self.cached_mel_emb = None def parallelize(self, device_map=None): self.device_map = ( - get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + get_device_map(len(self.transformer.h), range(max(1, torch.cuda.device_count()))) if device_map is None else device_map ) @@ -62,6 +62,8 @@ class GPT2InferenceModel(GPT2PreTrainedModel): self.lm_head = self.lm_head.to("cpu") self.model_parallel = False torch.cuda.empty_cache() + if torch.backends.mps.is_available(): + torch.mps.empty_cache() def get_output_embeddings(self): return self.lm_head @@ -162,7 +164,10 @@ class GPT2InferenceModel(GPT2PreTrainedModel): # Set device for model parallelism if self.model_parallel: - torch.cuda.set_device(self.transformer.first_device) + if torch.backends.mps.is_available(): + self.to(self.transformer.first_device) + else: + torch.cuda.set_device(self.transformer.first_device) hidden_states = hidden_states.to(self.lm_head.weight.device) lm_logits = self.lm_head(hidden_states) diff --git a/tortoise/models/diffusion_decoder.py b/tortoise/models/diffusion_decoder.py index f67d21a..e969129 100644 --- a/tortoise/models/diffusion_decoder.py +++ b/tortoise/models/diffusion_decoder.py @@ -302,7 +302,10 @@ class DiffusionTts(nn.Module): unused_params.extend(list(lyr.parameters())) else: # First and last blocks will have autocast disabled for improved precision. 
- with autocast(x.device.type, enabled=self.enable_fp16 and i != 0): + if not torch.backends.mps.is_available(): + with autocast(x.device.type, enabled=self.enable_fp16 and i != 0): + x = lyr(x, time_emb) + else: x = lyr(x, time_emb) x = x.float() diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index 91237dd..6842af5 100644 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -180,7 +180,7 @@ class TacotronSTFT(torch.nn.Module): return mel_output -def wav_to_univnet_mel(wav, do_normalization=False, device='cuda'): +def wav_to_univnet_mel(wav, do_normalization=False, device='cuda' if not torch.backends.mps.is_available() else 'mps'): stft = TacotronSTFT(1024, 256, 1024, 100, 24000, 0, 12000) stft = stft.to(device) mel = stft.mel_spectrogram(wav) diff --git a/tortoise/utils/diffusion.py b/tortoise/utils/diffusion.py index e877ff2..6d4d594 100644 --- a/tortoise/utils/diffusion.py +++ b/tortoise/utils/diffusion.py @@ -1244,7 +1244,7 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): dimension equal to the length of timesteps. :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. """ - res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + res = th.from_numpy(arr.astype(np.float32)).to(device=timesteps.device)[timesteps] while len(res.shape) < len(broadcast_shape): res = res[..., None] return res.expand(broadcast_shape) \ No newline at end of file diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py index bbe3285..adc39e3 100644 --- a/tortoise/utils/wav2vec_alignment.py +++ b/tortoise/utils/wav2vec_alignment.py @@ -49,7 +49,7 @@ class Wav2VecAlignment: """ Uses wav2vec2 to perform audio<->text alignment. 
""" - def __init__(self, device='cuda'): + def __init__(self, device='cuda' if not torch.backends.mps.is_available() else 'mps'): self.model = Wav2Vec2ForCTC.from_pretrained("jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli").cpu() self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"facebook/wav2vec2-large-960h") self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('jbetker/tacotron-symbols') From 8d67995ba7c27c52705479596a5a6fe6cb834c9e Mon Sep 17 00:00:00 2001 From: Jerry-Master Date: Sun, 6 Aug 2023 19:01:10 +0200 Subject: [PATCH 2/6] Addes MPS support --- tortoise/api.py | 2 +- tortoise/models/arch_util.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tortoise/api.py b/tortoise/api.py index a095a88..50ad6e2 100644 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -100,7 +100,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi conditioning_free=cond_free, conditioning_free_k=cond_free_k) -def format_conditioning(clip, cond_length=132300, device='cuda'): +def format_conditioning(clip, cond_length=132300, device="cuda" if not torch.backends.mps.is_available() else 'mps'): """ Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models. 
""" diff --git a/tortoise/models/arch_util.py b/tortoise/models/arch_util.py index 661ee1f..f678a02 100644 --- a/tortoise/models/arch_util.py +++ b/tortoise/models/arch_util.py @@ -319,6 +319,8 @@ class TorchMelSpectrogram(nn.Module): if len(inp.shape) == 3: # Automatically squeeze out the channels dimension if it is present (assuming mono-audio) inp = inp.squeeze(1) assert len(inp.shape) == 2 + if torch.backends.mps.is_available(): + inp = inp.to('cpu') self.mel_stft = self.mel_stft.to(inp.device) mel = self.mel_stft(inp) # Perform dynamic range compression From dee73cd703f830b2dbca55324db4265272ae4002 Mon Sep 17 00:00:00 2001 From: Jose <34888496+Jerry-Master@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:42:45 +0200 Subject: [PATCH 3/6] Updated README.md --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index dc311cb..989cd1f 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,27 @@ Optionally, pytorch can be installed in the base environment, so that other cond If you are on windows, you may also need to install pysoundfile: `conda install -c conda-forge pysoundfile` +## Apple Silicon + +On MacOS 13+ with M1/M2 chips you need to install the nightly version of pytorch; as stated on the official page, you can do: + +```shell +pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu +``` + +Be sure to do that after you activate the environment. If you don't use conda the commands would look like this: + +```shell +python3.10 -m venv .venv +source .venv/bin/activate +pip install numba inflect +pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu +pip install transformers +git clone https://github.com/neonbjb/tortoise-tts.git +cd tortoise-tts +pip install . +``` + ### do_tts.py This script allows you to speak a single phrase with one or more voices.
From bbe3c1550e9b725715947bdd39c3fccdd6371117 Mon Sep 17 00:00:00 2001 From: Jose <34888496+Jerry-Master@users.noreply.github.com> Date: Fri, 11 Aug 2023 16:34:52 +0200 Subject: [PATCH 4/6] Updated for read.py --- README.md | 3 +++ tortoise/read.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 989cd1f..f6b1725 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,9 @@ cd tortoise-tts pip install . ``` +Be aware that the code does not work with DeepSpeed, try to avoid using the flag `--use_deepspeed` in the commands below. +Also, mixed precision is not supported on MPS so don't use the `--half` flag either. + ### do_tts.py This script allows you to speak a single phrase with one or more voices. diff --git a/tortoise/read.py b/tortoise/read.py index 38f95ae..ab31bae 100644 --- a/tortoise/read.py +++ b/tortoise/read.py @@ -24,7 +24,7 @@ if __name__ == '__main__': 'should only be specified if you have custom checkpoints.', default=MODELS_DIR) parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None) parser.add_argument('--produce_debug_state', type=bool, help='Whether or not to produce debug_state.pth, which can aid in reproducing problems. 
Defaults to true.', default=True) - parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=True) + parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=False) parser.add_argument('--kv_cache', type=bool, help='If you disable this please wait for a long a time to get the output', default=True) parser.add_argument('--half', type=bool, help="float16(half) precision inference if True it's faster and take less vram and ram", default=True) From bde2cc2bff2ddc3a2ead0e5ec0d078d16749da6d Mon Sep 17 00:00:00 2001 From: Jose <34888496+Jerry-Master@users.noreply.github.com> Date: Fri, 11 Aug 2023 16:39:09 +0200 Subject: [PATCH 5/6] Removed deepspeed for MPS --- README.md | 4 ++-- tortoise/do_tts.py | 2 ++ tortoise/read.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f6b1725..8da668d 100644 --- a/README.md +++ b/README.md @@ -108,8 +108,8 @@ cd tortoise-tts pip install . ``` -Be aware that the code does not work with DeepSpeed, try to avoid using the flag `--use_deepspeed` in the commands below. -Also, mixed precision is not supported on MPS so don't use the `--half` flag either. +Be aware that DeepSpeed is disabled on Apple Silicon since it does not work. The flag `--use_deepspeed` is ignored. + ### do_tts.py diff --git a/tortoise/do_tts.py b/tortoise/do_tts.py index 39acab3..2f0e562 100644 --- a/tortoise/do_tts.py +++ b/tortoise/do_tts.py @@ -25,6 +25,8 @@ if __name__ == '__main__': parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.' 'Increasing this can in some cases reduce the likelihood of multiple speakers. 
Defaults to 0 (disabled)', default=.0) args = parser.parse_args() + if torch.backends.mps.is_available(): + args.use_deepspeed = False os.makedirs(args.output_path, exist_ok=True) tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half) diff --git a/tortoise/read.py b/tortoise/read.py index ab31bae..e5839aa 100644 --- a/tortoise/read.py +++ b/tortoise/read.py @@ -30,6 +30,8 @@ if __name__ == '__main__': args = parser.parse_args() + if torch.backends.mps.is_available(): + args.use_deepspeed = False tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half) outpath = args.output_path From 791220fd04dbacaa493f7dcadfd7cd6271383c3e Mon Sep 17 00:00:00 2001 From: Jose <34888496+Jerry-Master@users.noreply.github.com> Date: Fri, 11 Aug 2023 16:44:22 +0200 Subject: [PATCH 6/6] Added extra clarification. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8da668d..1390f58 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ pip install . ``` Be aware that DeepSpeed is disabled on Apple Silicon since it does not work. The flag `--use_deepspeed` is ignored. +You may need to prepend `PYTORCH_ENABLE_MPS_FALLBACK=1` to the commands below to make them work since MPS does not support all the operations in Pytorch. ### do_tts.py