diff --git a/requirements.txt b/requirements.txt
index 3a7bf67..031e3d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,4 @@ py-cpuinfo
 hjson
 psutil
 sounddevice
+torch-directml
diff --git a/scripts/tortoise_tts.py b/scripts/tortoise_tts.py
old mode 100755
new mode 100644
diff --git a/tortoise/api.py b/tortoise/api.py
index 69807b1..5c4b233 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -4,7 +4,7 @@ import uuid
 from time import time
 from urllib import request
 
-import torch
+import torch, torch_directml
 import torch.nn.functional as F
 import progressbar
 import torchaudio
@@ -70,10 +70,18 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
 
-def format_conditioning(clip, cond_length=132300, device="cuda" if not torch.backends.mps.is_available() else 'mps'):
+def format_conditioning(clip, cond_length=132300, device=''):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
     """
+
+    if device == '':
+        device = 'cpu'
+        if torch.cuda.is_available():
+            device = 'cuda'
+        elif torch.backends.mps.is_available():
+            device = 'mps'
+
     gap = clip.shape[-1] - cond_length
     if gap < 0:
         clip = F.pad(clip, pad=(0, abs(gap)))
@@ -169,6 +177,11 @@ def pick_best_batch_size_for_gpu():
             return 8
         elif availableGb > 7:
             return 4
+
+    # DirectML is available, but we can't query how much memory it has.
+    if torch_directml.is_available():
+        return 16
+
     return 1
 
 class TextToSpeech:
@@ -194,9 +207,16 @@ class TextToSpeech:
         self.models_dir = models_dir
         self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None else autoregressive_batch_size
         self.enable_redaction = enable_redaction
-        self.device = torch.device('cuda' if torch.cuda.is_available() else'cpu')
-        if torch.backends.mps.is_available():
+
+        if torch.cuda.is_available():
+            self.device = torch.device('cuda')
+        elif torch.backends.mps.is_available():
             self.device = torch.device('mps')
+        elif torch_directml.is_available():
+            self.device = torch_directml.device(0)
+        else:
+            self.device = torch.device('cpu')
+
         if self.enable_redaction:
             self.aligner = Wav2VecAlignment()
 
@@ -240,7 +260,7 @@ class TextToSpeech:
     @contextmanager
     def temporary_cuda(self, model):
         m = model.to(self.device)
         yield m
-        m = model.cpu()
+        # m = model.cpu()
 
     def load_cvvp(self):
@@ -379,6 +399,7 @@ class TextToSpeech:
         text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
         text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
         assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
+        auto_conds = None
         if voice_samples is not None:
             auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True)
@@ -398,7 +419,13 @@ class TextToSpeech:
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
                 print("Generating autoregressive samples..")
-            if not torch.backends.mps.is_available():
+            print(f"CUDA Available: \t{torch.cuda.is_available()}")
+            print(f"MPS Available: \t\t{torch.backends.mps.is_available()}")
+            print(f"DirectML Available: \t{torch_directml.is_available()}")
+            print(f"Autoregressive Batch Size: {self.autoregressive_batch_size}")
+
+            # CUDA
+            if torch.cuda.is_available():
                 with self.temporary_cuda(self.autoregressive
                     ) as autoregressive, torch.autocast(device_type="cuda", dtype=torch.float16, enabled=self.half):
                     for b in tqdm(range(num_batches), disable=not verbose):
@@ -414,6 +441,27 @@ class TextToSpeech:
                         padding_needed = max_mel_tokens - codes.shape[1]
                         codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
                         samples.append(codes)
+            # MPS
+            elif torch.backends.mps.is_available():
+                with self.temporary_cuda(self.autoregressive
+                    ) as autoregressive, torch.autocast(device_type="mps", dtype=torch.float16, enabled=self.half):
+                    for b in tqdm(range(num_batches), disable=not verbose):
+                        codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                                do_sample=True,
+                                                                top_p=top_p,
+                                                                temperature=temperature,
+                                                                num_return_sequences=self.autoregressive_batch_size,
+                                                                length_penalty=length_penalty,
+                                                                repetition_penalty=repetition_penalty,
+                                                                max_generate_length=max_mel_tokens,
+                                                                **hf_generate_kwargs)
+                        padding_needed = max_mel_tokens - codes.shape[1]
+                        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
+                        samples.append(codes)
+
+            # CPU
+            # DirectML doesn't support autocast for now.
+            # https://github.com/microsoft/DirectML/issues/454#issuecomment-1703862192
             else:
                 with self.temporary_cuda(self.autoregressive) as autoregressive:
                     for b in tqdm(range(num_batches), disable=not verbose):
@@ -432,9 +480,10 @@ class TextToSpeech:
 
             clip_results = []
-            if not torch.backends.mps.is_available():
+            # CUDA
+            if torch.cuda.is_available():
                 with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
+                    device_type="cuda", dtype=torch.float16, enabled=self.half
                 ):
                     if cvvp_amount > 0:
                         if self.cvvp is None:
@@ -464,6 +513,43 @@ class TextToSpeech:
                     clip_results = torch.cat(clip_results, dim=0)
                     samples = torch.cat(samples, dim=0)
                     best_results = samples[torch.topk(clip_results, k=k).indices]
+
+            # MPS
+            elif torch.backends.mps.is_available():
+                with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
+                    device_type="mps", dtype=torch.float16, enabled=self.half
+                ):
+                    if cvvp_amount > 0:
+                        if self.cvvp is None:
+                            self.load_cvvp()
+                        self.cvvp = self.cvvp.to(self.device)
+                    if verbose:
+                        if self.cvvp is None:
+                            print("Computing best candidates using CLVP")
+                        else:
+                            print(f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%")
+                    for batch in tqdm(samples, disable=not verbose):
+                        for i in range(batch.shape[0]):
+                            batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
+                        if cvvp_amount != 1:
+                            clvp_out = clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False)
+                        if auto_conds is not None and cvvp_amount > 0:
+                            cvvp_accumulator = 0
+                            for cl in range(auto_conds.shape[1]):
+                                cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False)
+                            cvvp = cvvp_accumulator / auto_conds.shape[1]
+                            if cvvp_amount == 1:
+                                clip_results.append(cvvp)
+                            else:
+                                clip_results.append(cvvp * cvvp_amount + clvp_out * (1-cvvp_amount))
+                        else:
+                            clip_results.append(clvp_out)
+                    clip_results = torch.cat(clip_results, dim=0)
+                    samples = torch.cat(samples, dim=0)
+                    best_results = samples[torch.topk(clip_results, k=k).indices]
+
+            # CPU
+            # DirectML does not support autocast for now.
             else:
                 with self.temporary_cuda(self.clvp) as clvp:
                     if cvvp_amount > 0:
@@ -501,21 +587,24 @@ class TextToSpeech:
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive, torch.autocast(
-                    device_type="cuda" if not torch.backends.mps.is_available() else 'mps', dtype=torch.float16, enabled=self.half
-                ):
+            if torch.cuda.is_available():
+                with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast(
+                        device_type="cuda", dtype=torch.float16, enabled=self.half):
+                    best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                                  torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
+                                                  torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                                  return_latent=True, clip_inputs=False)
+                del auto_conditioning
+            elif torch.backends.mps.is_available():
+                with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast(
+                        device_type="mps", dtype=torch.float16, enabled=self.half):
                     best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                                   torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
                                                   torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
                                                   return_latent=True, clip_inputs=False)
                 del auto_conditioning
             else:
-                with self.temporary_cuda(
-                    self.autoregressive
-                ) as autoregressive:
+                with self.temporary_cuda(self.autoregressive) as autoregressive:
                     best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                                   torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
                                                   torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
@@ -525,10 +614,8 @@ class TextToSpeech:
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            if not torch.backends.mps.is_available():
-                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(
-                    self.vocoder
-                ) as vocoder:
+            if torch.cuda.is_available() or torch.backends.mps.is_available() or torch_directml.is_available():
+                with self.temporary_cuda(self.diffusion) as diffusion, self.temporary_cuda(self.vocoder) as vocoder:
                     for b in range(best_results.shape[0]):
                         codes = best_results[b].unsqueeze(0)
                         latents = best_latents[b].unsqueeze(0)
diff --git a/tortoise/models/diffusion_decoder.py b/tortoise/models/diffusion_decoder.py
index e969129..8f48ca7 100644
--- a/tortoise/models/diffusion_decoder.py
+++ b/tortoise/models/diffusion_decoder.py
@@ -302,7 +302,7 @@ class DiffusionTts(nn.Module):
                     unused_params.extend(list(lyr.parameters()))
                 else:
                     # First and last blocks will have autocast disabled for improved precision.
-                    if not torch.backends.mps.is_available():
+                    if torch.cuda.is_available():
                         with autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
                             x = lyr(x, time_emb)
                     else:
diff --git a/tortoise/models/vocoder.py b/tortoise/models/vocoder.py
index 8b60dbd..b5f1c41 100644
--- a/tortoise/models/vocoder.py
+++ b/tortoise/models/vocoder.py
@@ -1,4 +1,4 @@
-import torch
+import torch, torch_directml
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -208,10 +208,16 @@ class LVCBlock(torch.nn.Module):
         x = x.unfold(4, kernel_size, 1)  # (batch, in_channels, kernel_length, dilation, _, kernel_size)
         o = torch.einsum('bildsk,biokl->bolsd', x, kernel)
-        o = o.to(memory_format=torch.channels_last_3d)
-        bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
-        o = o + bias
-        o = o.contiguous().view(batch, out_channels, -1)
+
+        if torch_directml.is_available():
+            o = o + bias.unsqueeze(-1).unsqueeze(-1)
+            o = o.contiguous().view(batch, out_channels, -1)
+
+        else:
+            o = o.to(memory_format=torch.channels_last_3d)
+            bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
+            o = o + bias
+            o = o.contiguous().view(batch, out_channels, -1)
 
         return o