Add fast inference API (tortoise/api_fast.py) for Tortoise

2026-03-15 09:53:51 +01:00 · 2023-10-18 18:52:39 +05:30 · 2023-10-18 18:52:39 +05:30 · ab270c7b31
parent 5bbb0e0b97
commit ab270c7b31
7 changed files with 1969 additions and 35 deletions
--- a/setup.py
+++ b/setup.py
@ -29,7 +29,7 @@ setuptools.setup(
        'librosa',
        'transformers==4.31.0',
        'tokenizers',
-        'deepspeed==0.8.3',
+        # 'deepspeed==0.8.3',
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
--- a/tortoise/api.py
+++ b/tortoise/api.py
@ -23,6 +23,7 @@ from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named
 from tortoise.utils.tokenizer import VoiceBpeTokenizer
 from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
 from contextlib import contextmanager
+from huggingface_hub import hf_hub_download
 pbar = None

 DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'tortoise', 'models')
@ -38,44 +39,13 @@ MODELS = {
    'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
 }

-def download_models(specific_models=None):
-    """
-    Call to download all the models that Tortoise uses.
-    """
-    os.makedirs(MODELS_DIR, exist_ok=True)
-
-    def show_progress(block_num, block_size, total_size):
-        global pbar
-        if pbar is None:
-            pbar = progressbar.ProgressBar(maxval=total_size)
-            pbar.start()
-
-        downloaded = block_num * block_size
-        if downloaded < total_size:
-            pbar.update(downloaded)
-        else:
-            pbar.finish()
-            pbar = None
-    for model_name, url in MODELS.items():
-        if specific_models is not None and model_name not in specific_models:
-            continue
-        model_path = os.path.join(MODELS_DIR, model_name)
-        if os.path.exists(model_path):
-            continue
-        print(f'Downloading {model_name} from {url}...')
-        request.urlretrieve(url, model_path, show_progress)
-        print('Done.')
-
-
 def get_model_path(model_name, models_dir=MODELS_DIR):
    """
    Get path to given model, download it if it doesn't exist.
    """
    if model_name not in MODELS:
        raise ValueError(f'Model {model_name} not found in available models.')
-    model_path = os.path.join(models_dir, model_name)
-    if not os.path.exists(model_path) and models_dir == MODELS_DIR:
-        download_models([model_name])
+    model_path = hf_hub_download(repo_id="Manmay/tortoise-tts", filename=model_name, cache_dir=models_dir)
    return model_path


--- a/tortoise/api_fast.py
+++ b/tortoise/api_fast.py
@ -0,0 +1,500 @@
+import os
+import random
+import uuid
+from time import time
+from urllib import request
+
+import torch
+import torch.nn.functional as F
+import progressbar
+import torchaudio
+import numpy as np
+from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead
+from tortoise.models.diffusion_decoder import DiffusionTts
+from tortoise.models.autoregressive import UnifiedVoice
+from tqdm import tqdm
+from tortoise.models.arch_util import TorchMelSpectrogram
+from tortoise.models.clvp import CLVP
+from tortoise.models.cvvp import CVVP
+from tortoise.models.hifigan_decoder import HifiganGenerator
+from tortoise.models.random_latent_generator import RandomLatentConverter
+from tortoise.models.vocoder import UnivNetGenerator
+from tortoise.utils.audio import wav_to_univnet_mel, denormalize_tacotron_mel
+from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
+from tortoise.utils.tokenizer import VoiceBpeTokenizer
+from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
+from contextlib import contextmanager
+from tortoise.models.stream_generator import init_stream_support
+from huggingface_hub import hf_hub_download
+# NOTE(review): pbar is not referenced anywhere else in the visible part of this module.
+pbar = None
+# Runs at import time. Presumably installs the `generate_stream` method that
+# UnifiedVoice.get_generator calls on its inference model - TODO confirm in stream_generator.
+init_stream_support()
+# Default checkpoint cache location; can be overridden with the TORTOISE_MODELS_DIR environment variable.
+DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'tortoise', 'models')
+MODELS_DIR = os.environ.get('TORTOISE_MODELS_DIR', DEFAULT_MODELS_DIR)
+
+# Checkpoints known to this module. get_model_path() only uses the keys (as a whitelist and as the
+# filename handed to hf_hub_download); the URL values are not read by any code visible in this module.
+MODELS = {
+    'autoregressive.pth': 'https://huggingface.co/Manmay/tortoise-tts/resolve/main/autoregressive.pth',
+    'classifier.pth': 'https://huggingface.co/Manmay/tortoise-tts/resolve/main/classifier.pth',
+    'rlg_auto.pth': 'https://huggingface.co/Manmay/tortoise-tts/resolve/main/rlg_auto.pth',
+    'hifidecoder.pth': 'https://huggingface.co/Manmay/tortoise-tts/resolve/main/hifidecoder.pth',
+}
+
+def get_model_path(model_name, models_dir=MODELS_DIR):
+    """
+    Resolve a known checkpoint name to a local file path.
+
+    The file is obtained through hf_hub_download from the "Manmay/tortoise-tts" repository, with
+    models_dir passed as the cache directory.
+    :param model_name: Checkpoint file name; must be one of the keys of MODELS.
+    :param models_dir: Directory handed to hf_hub_download as cache_dir.
+    :return: The path returned by hf_hub_download.
+    :raises ValueError: If model_name is not a key of MODELS.
+    """
+    if model_name in MODELS:
+        return hf_hub_download(repo_id="Manmay/tortoise-tts", filename=model_name, cache_dir=models_dir)
+    raise ValueError(f'Model {model_name} not found in available models.')
+
+
+def pad_or_truncate(t, length):
+    """
+    Force the last dimension of <t> to be exactly <length> long.
+
+    Shorter inputs are right-padded with zeros, longer inputs are clipped, and inputs that already
+    have the requested length are returned as-is (same object).
+    """
+    missing = length - t.shape[-1]
+    if missing == 0:
+        return t
+    if missing > 0:
+        # Only the trailing edge of the last dimension is padded.
+        return F.pad(t, (0, missing))
+    return t[..., :length]
+
+
+def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusion_steps=200, cond_free=True, cond_free_k=1):
+    """
+    Build the SpacedDiffusion instance used for vocoding.
+
+    :param trained_diffusion_steps: Step count used for the linear beta schedule and as the base for respacing.
+    :param desired_diffusion_steps: Step count requested from space_timesteps.
+    :param cond_free: Forwarded as conditioning_free.
+    :param cond_free_k: Forwarded as conditioning_free_k.
+    :return: The configured SpacedDiffusion object.
+    """
+    timesteps = space_timesteps(trained_diffusion_steps, [desired_diffusion_steps])
+    betas = get_named_beta_schedule('linear', trained_diffusion_steps)
+    return SpacedDiffusion(
+        use_timesteps=timesteps,
+        model_mean_type='epsilon',
+        model_var_type='learned_range',
+        loss_type='mse',
+        betas=betas,
+        conditioning_free=cond_free,
+        conditioning_free_k=cond_free_k,
+    )
+
+
+def format_conditioning(clip, cond_length=132300, device=None):
+    """
+    Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
+
+    :param clip: Waveform tensor. The crop below indexes it as clip[:, start:end], so it needs at least
+                 two dimensions with samples on the last one.
+    :param cond_length: Number of samples to keep. Shorter clips are zero-padded at the end, longer
+                        clips are cropped at a random offset (uses the `random` module).
+    :param device: Device the resulting MEL is moved to. When omitted it is chosen at call time:
+                   'mps' if available, otherwise 'cuda' if available, otherwise 'cpu'.
+    :return: The MEL spectrogram produced by TorchMelSpectrogram, on `device`.
+    """
+    if device is None:
+        # Resolved per call rather than in the signature: a signature default is evaluated once at
+        # import time and used to fall back to "cuda" even on hosts without any GPU.
+        if torch.backends.mps.is_available():
+            device = 'mps'
+        elif torch.cuda.is_available():
+            device = 'cuda'
+        else:
+            device = 'cpu'
+    gap = clip.shape[-1] - cond_length
+    if gap < 0:
+        clip = F.pad(clip, pad=(0, abs(gap)))
+    elif gap > 0:
+        rand_start = random.randint(0, gap)
+        clip = clip[:, rand_start:rand_start + cond_length]
+    mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0)
+    return mel_clip.unsqueeze(0).to(device)
+
+
+def fix_autoregressive_output(codes, stop_token, complain=True):
+    """
+    Patch the tail of autoregressive codes so that they end with the padding pattern used here.
+
+    Every stop token, and everything after the first one, is overwritten with 83; the last three
+    entries are then set to 45, 45, 248. `codes` is modified in place and also returned. If no stop
+    token is present the codes are returned untouched (optionally printing a warning).
+
+    The specific token values are tied to the DVAE the models were trained with; per the original
+    author, skipping this step produces speech with a harsh ending.
+    """
+    stop_positions = (codes == stop_token).nonzero()
+    if len(stop_positions) == 0:
+        if complain:
+            print("No stop tokens found in one of the generated voice clips. This typically means the spoken audio is "
+                  "too long. In some cases, the output will still be good, though. Listen to it and if it is missing words, "
+                  "try breaking up your input text.")
+        return codes
+
+    # Strip off the autoregressive stop token(s) and everything that follows the first one.
+    codes[stop_positions] = 83
+    first_stop = stop_positions.min().item()
+    codes[first_stop:] = 83
+    if first_stop - 3 < codes.shape[0]:
+        for offset, token in ((-3, 45), (-2, 45), (-1, 248)):
+            codes[offset] = token
+
+    return codes
+
+
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True):
+    """
+    Run the diffusion model over the given latents and return the denormalized spectrogram.
+
+    :param diffusion_model: Model exposing timestep_independent(); also passed to the sampling loop.
+    :param diffuser: Object exposing p_sample_loop().
+    :param latents: Input latents; dimension 0 is used as the batch size and dimension 1 as the sequence length.
+    :param conditioning_latents: Forwarded to diffusion_model.timestep_independent().
+    :param temperature: Scale applied to the initial Gaussian noise.
+    :param verbose: Forwarded to p_sample_loop as `progress`.
+    """
+    with torch.no_grad():
+        # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        n_frames = latents.shape[1] * 4 * 24000 // 22050
+        target_shape = (latents.shape[0], 100, n_frames)
+        aligned = diffusion_model.timestep_independent(latents, conditioning_latents, n_frames, False)
+
+        start_noise = torch.randn(target_shape, device=latents.device) * temperature
+        sampled = diffuser.p_sample_loop(
+            diffusion_model,
+            target_shape,
+            noise=start_noise,
+            model_kwargs={'precomputed_aligned_embeddings': aligned},
+            progress=verbose,
+        )
+        return denormalize_tacotron_mel(sampled)[:, :, :n_frames]
+
+
+def classify_audio_clip(clip):
+    """
+    Score a clip with Tortoise's audio classifier.
+
+    The classifier is built and its 'classifier.pth' weights are loaded onto the CPU on every call.
+    :param clip: torch tensor containing audio waveform data (get it from load_audio)
+    :return: results[0][0] of the softmax over the classifier output, i.e. the first class score of the
+             first batch entry, as a tensor. NOTE(review): the original description says this score
+             is for "came from Tortoise" - confirm the class order against the classifier training.
+    """
+    detector = AudioMiniEncoderWithClassifierHead(
+        2, spec_dim=1, embedding_dim=512, depth=5, downsample_factor=4,
+        resnet_blocks=2, attn_blocks=4, num_attn_heads=4, base_channels=32,
+        dropout=0, kernel_size=5, distribute_zero_label=False,
+    )
+    weights = torch.load(get_model_path('classifier.pth'), map_location=torch.device('cpu'))
+    detector.load_state_dict(weights)
+    batch = clip.cpu().unsqueeze(0)
+    scores = F.softmax(detector(batch), dim=-1)
+    return scores[0][0]
+
+
+def pick_best_batch_size_for_gpu():
+    """
+    Heuristically choose an autoregressive batch size from the amount of memory on the accelerator.
+
+    On CUDA the free memory reported by torch.cuda.mem_get_info() is used; on MPS the total system
+    memory reported by psutil is used. More than 14 GiB gives 16, more than 10 GiB gives 8, more
+    than 7 GiB gives 4, and anything else (including no accelerator) gives 1. These sizes aren't
+    guaranteed to work, but they should give you a good shot.
+    """
+    def _size_for(gib):
+        # Shared threshold ladder; None means "too small, keep looking".
+        for floor, batch in ((14, 16), (10, 8), (7, 4)):
+            if gib > floor:
+                return batch
+        return None
+
+    if torch.cuda.is_available():
+        _, free_bytes = torch.cuda.mem_get_info()
+        picked = _size_for(free_bytes / (1024 ** 3))
+        if picked is not None:
+            return picked
+    if torch.backends.mps.is_available():
+        import psutil
+        picked = _size_for(psutil.virtual_memory().total / (1024 ** 3))
+        if picked is not None:
+            return picked
+    return 1
+
+class TextToSpeech:
+    """
+    Main entry point into Tortoise (fast variant): an autoregressive model followed by a HiFi-GAN decoder.
+    """
+
+    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR,
+                 enable_redaction=True, kv_cache=False, use_deepspeed=False, half=False, device=None,
+                 tokenizer_vocab_file=None, tokenizer_basic=False):
+        """
+        Constructor
+        :param autoregressive_batch_size: Stored on the instance; when omitted it is taken from
+                                          pick_best_batch_size_for_gpu().
+        :param models_dir: Where model weights are stored. This should only be specified if you are providing your own
+                           models, otherwise use the defaults.
+        :param enable_redaction: When true, a Wav2VecAlignment aligner is created. Default is true.
+                                 NOTE(review): none of the methods of this class use the aligner.
+        :param kv_cache: Forwarded to autoregressive.post_init_gpt2_config().
+        :param use_deepspeed: Forwarded to autoregressive.post_init_gpt2_config().
+        :param half: Enables float16 autocast around autoregressive generation; also forwarded to
+                     post_init_gpt2_config().
+        :param device: Device to use when running the model. If omitted, the device will be automatically chosen
+                       ('mps' if available, otherwise 'cuda' if available, otherwise 'cpu').
+        :param tokenizer_vocab_file: Forwarded to VoiceBpeTokenizer as vocab_file.
+        :param tokenizer_basic: Forwarded to VoiceBpeTokenizer as use_basic_cleaners.
+        """
+        self.models_dir = models_dir
+        self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None else autoregressive_batch_size
+        self.enable_redaction = enable_redaction
+        if device is None:
+            # Same precedence as the previous hard-coded selection: MPS wins over CUDA, CPU is the fallback.
+            if torch.backends.mps.is_available():
+                device = 'mps'
+            elif torch.cuda.is_available():
+                device = 'cuda'
+            else:
+                device = 'cpu'
+        # An explicitly requested device used to be silently ignored; it is honoured now.
+        self.device = torch.device(device)
+        if self.enable_redaction:
+            self.aligner = Wav2VecAlignment()
+
+        self.tokenizer = VoiceBpeTokenizer(
+            vocab_file=tokenizer_vocab_file,
+            use_basic_cleaners=tokenizer_basic,
+        )
+        self.half = half
+        if os.path.exists(f'{models_dir}/autoregressive.ptt'):
+            # Assume this is a traced directory.
+            self.autoregressive = torch.jit.load(f'{models_dir}/autoregressive.ptt')
+        else:
+            self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
+                                               model_dim=1024,
+                                               heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
+                                               train_solo_embeddings=False).to(self.device).eval()
+            # Deserialize on the CPU so the checkpoint also loads on hosts without the device it was
+            # saved from; load_state_dict copies the values into the already-placed parameters.
+            ar_state = torch.load(get_model_path('autoregressive.pth', models_dir), map_location=torch.device('cpu'))
+            self.autoregressive.load_state_dict(ar_state, strict=False)
+            self.autoregressive.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=kv_cache, half=self.half)
+
+        self.hifi_decoder = HifiganGenerator(
+            in_channels=1024, out_channels=1, resblock_type="1",
+            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], resblock_kernel_sizes=[3, 7, 11],
+            upsample_kernel_sizes=[16, 16, 4, 4], upsample_initial_channel=512, upsample_factors=[8, 8, 2, 2],
+            cond_channels=1024).to(self.device).eval()
+        # Use the caller's models_dir here too, like the autoregressive checkpoint above.
+        hifi_model = torch.load(get_model_path('hifidecoder.pth', models_dir), map_location=torch.device('cpu'))
+        self.hifi_decoder.load_state_dict(hifi_model, strict=False)
+        # Random latent generators (RLGs) are loaded lazily.
+        self.rlg_auto = None
+
+    def get_conditioning_latents(self, voice_samples, return_mels=False):
+        """
+        Transforms one or more voice_samples into the autoregressive conditioning latent.
+        :param voice_samples: A waveform tensor or a list of them. Each one is passed through
+                              format_conditioning(); the results are stacked along dim 1 and fed to
+                              autoregressive.get_conditioning().
+        :param return_mels: Accepted for compatibility; it has no effect on the result.
+        :return: The value returned by autoregressive.get_conditioning() (a single latent, not a tuple).
+        """
+        with torch.no_grad():
+            # Wrap a lone tensor BEFORE iterating; iterating a tensor would split it along dim 0.
+            if torch.is_tensor(voice_samples):
+                voice_samples = [voice_samples]
+            auto_conds = [format_conditioning(vs.to(self.device), device=self.device) for vs in voice_samples]
+            auto_conds = torch.stack(auto_conds, dim=1)
+            auto_latent = self.autoregressive.get_conditioning(auto_conds)
+
+        return auto_latent
+
+    def get_random_conditioning_latents(self):
+        """
+        Returns a conditioning latent produced by the random latent generator.
+        The generator is created and its 'rlg_auto.pth' weights loaded (on the CPU) on first use.
+        """
+        # Lazy-load the RLG models.
+        if self.rlg_auto is None:
+            self.rlg_auto = RandomLatentConverter(1024).eval()
+            self.rlg_auto.load_state_dict(torch.load(get_model_path('rlg_auto.pth', self.models_dir), map_location=torch.device('cpu')))
+        with torch.no_grad():
+            return self.rlg_auto(torch.tensor([0.0]))
+
+    def tts_with_preset(self, text, preset='fast', **kwargs):
+        """
+        Calls TTS with one of a set of preset generation parameters. Options:
+            'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
+            'fast': Decent quality speech at a decent inference rate. A good choice for mass inference.
+            'standard': Very good quality. This is generally about as good as you are going to get.
+            'high_quality': Use if you want the absolute best. This is not really worth the compute, though.
+
+        This is a generator: it iterates over the value returned by tts() and yields each item.
+        :param text: Text to be spoken.
+        :param preset: Key into the preset table below; an unknown key raises KeyError.
+        :param kwargs: Overrides applied on top of the preset before calling tts().
+        """
+        # Use generally found best tuning knobs for generation.
+        settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
+                    'top_p': .8,
+                    'cond_free_k': 2.0, 'diffusion_temperature': 1.0}
+        # Presets are defined here.
+        presets = {
+            'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 10},
+            'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
+            'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
+            'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
+        }
+        settings.update(presets[preset])
+        settings.update(kwargs) # allow overriding of preset settings with kwargs
+        # NOTE(review): tts() returns a tensor, so this loop yields slices along its first dimension,
+        # and the diffusion-only keys above land in tts()'s **hf_generate_kwargs. Presumably
+        # tts_stream() was the intended target - confirm before relying on this method.
+        for audio_frame in self.tts(text, **settings):
+            yield audio_frame
+
+    # taken from here https://github.com/coqui-ai/TTS/blob/d21f15cc850788f9cdf93dac0321395138665287/TTS/tts/models/xtts.py#L666
+    def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
+        """
+        Handle chunk formatting in streaming mode.
+        :param wav_gen: Full waveform generated so far (1-D).
+        :param wav_gen_prev: Waveform passed on the previous call, or None on the first call.
+        :param wav_overlap: Tail kept from the previous call, or None on the first call.
+        :param overlap_len: Number of trailing samples held back and cross-faded into the next chunk.
+        :return: (new chunk, wav_gen, last overlap_len samples of wav_gen).
+        """
+        wav_chunk = wav_gen[:-overlap_len]
+        if wav_gen_prev is not None:
+            # Only the part that was not emitted last time, minus the tail that is held back.
+            wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
+        if wav_overlap is not None:
+            # Linear cross-fade between the previous tail (fading out) and the new head (fading in).
+            crossfade_wav = wav_chunk[:overlap_len]
+            crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
+            wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
+            wav_chunk[:overlap_len] += crossfade_wav
+        wav_overlap = wav_gen[-overlap_len:]
+        wav_gen_prev = wav_gen
+        return wav_chunk, wav_gen_prev, wav_overlap
+
+    def tts_stream(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True, use_deterministic_seed=None,
+            return_deterministic_state=False, overlap_wav_len=1024, stream_chunk_size=40,
+            # autoregressive generation parameters follow
+            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
+            # CVVP parameters follow
+            cvvp_amount=.0,
+            # diffusion generation parameters follow
+            diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
+            **hf_generate_kwargs):
+        """
+        Generator that yields audio chunks of the given text while the autoregressive model is still decoding.
+        :param text: Text to be spoken. Must tokenize to fewer than 400 tokens (checked with an assert).
+        :param voice_samples: Reference clip(s) passed to get_conditioning_latents(). When None, a random
+                              conditioning latent is used instead.
+        :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true.
+        :param use_deterministic_seed: Seed passed to deterministic_state(); None seeds from the current time.
+        :param overlap_wav_len: Number of samples cross-faded between consecutive chunks (see handle_chunks).
+        :param stream_chunk_size: Number of generator steps collected before a chunk is decoded. The first
+                                  chunk waits for at least 60 steps. With a value <= 0 a single chunk is
+                                  produced once generation has finished.
+        ~~AUTOREGRESSIVE KNOBS~~
+        :param temperature: The softmax temperature of the autoregressive model.
+        :param length_penalty: A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs.
+        :param repetition_penalty: A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence
+                                   of long silences or "uhhhhhhs", etc.
+        :param top_p: P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely" (aka boring) outputs.
+        ~~OTHER STUFF~~
+        :param hf_generate_kwargs: Extra keyword args forwarded to autoregressive.get_generator().
+        The remaining parameters (conditioning_latents, k, return_deterministic_state, num_autoregressive_samples,
+        max_mel_tokens, cvvp_amount, diffusion_iterations, cond_free, cond_free_k, diffusion_temperature) are
+        accepted for signature compatibility and are not used by this method.
+        :return: Yields 1-D waveform chunks as produced by handle_chunks().
+        """
+        # Called for its seeding side effect; the returned seed is not needed here.
+        self.deterministic_state(seed=use_deterministic_seed)
+
+        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
+        text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
+        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
+        if voice_samples is not None:
+            auto_conditioning = self.get_conditioning_latents(voice_samples, return_mels=False)
+        else:
+            auto_conditioning = self.get_random_conditioning_latents()
+        auto_conditioning = auto_conditioning.to(self.device)
+
+        with torch.no_grad():
+            if verbose:
+                print("Generating autoregressive samples..")
+            with torch.autocast(
+                    device_type="cuda", dtype=torch.float16, enabled=self.half
+                ):
+                fake_inputs = self.autoregressive.compute_embeddings(
+                    auto_conditioning,
+                    text_tokens,
+                )
+                gpt_generator = self.autoregressive.get_generator(
+                    fake_inputs=fake_inputs,
+                    top_k=50,
+                    top_p=top_p,
+                    temperature=temperature,
+                    do_sample=True,
+                    num_beams=1,
+                    num_return_sequences=1,
+                    length_penalty=float(length_penalty),
+                    repetition_penalty=float(repetition_penalty),
+                    output_attentions=False,
+                    output_hidden_states=True,
+                    **hf_generate_kwargs,
+                )
+            all_latents = []  # every latent so far; the decoder is re-run on the whole sequence per chunk
+            codes_ = []       # codes collected since the last emitted chunk (only its length is used)
+            wav_gen_prev = None
+            wav_overlap = None
+            is_end = False
+            first_buffer = 60
+            while not is_end:
+                try:
+                    with torch.autocast(
+                        device_type="cuda", dtype=torch.float16, enabled=self.half
+                    ):
+                        codes, latent = next(gpt_generator)
+                        all_latents += [latent]
+                        codes_ += [codes]
+                except StopIteration:
+                    is_end = True
+
+                if is_end or (stream_chunk_size > 0 and len(codes_) >= max(stream_chunk_size, first_buffer)):
+                    # The larger initial buffer only applies to the first chunk.
+                    first_buffer = 0
+                    gpt_latents = torch.cat(all_latents, dim=0)[None, :]
+                    wav_gen = self.hifi_decoder.inference(gpt_latents.to(self.device), auto_conditioning)
+                    wav_gen = wav_gen.squeeze()
+                    wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
+                        wav_gen, wav_gen_prev, wav_overlap, overlap_wav_len
+                    )
+                    codes_ = []
+                    yield wav_chunk
+
+    def tts(self, text, voice_samples=None, k=1, verbose=True, use_deterministic_seed=None,
+            # autoregressive generation parameters follow
+            num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0,
+            top_p=.8, max_mel_tokens=500,
+            # CVVP parameters follow
+            cvvp_amount=.0,
+            **hf_generate_kwargs):
+        """
+        Produces an audio clip of the given text being spoken with the given reference voice.
+        :param text: Text to be spoken. Must tokenize to fewer than 400 tokens (checked with an assert).
+        :param voice_samples: Reference clip(s) passed to get_conditioning_latents(). When None, a random
+                              conditioning latent is used instead.
+        :param k: Repeat count applied to the conditioning latent and the text tokens before the latent pass.
+        :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true.
+        :param use_deterministic_seed: Seed passed to deterministic_state(); None seeds from the current time.
+        ~~AUTOREGRESSIVE KNOBS~~
+        :param temperature: The softmax temperature of the autoregressive model.
+        :param length_penalty: A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs.
+        :param repetition_penalty: A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence
+                                   of long silences or "uhhhhhhs", etc.
+        :param top_p: P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely" (aka boring) outputs.
+        ~~OTHER STUFF~~
+        :param hf_generate_kwargs: Extra keyword args forwarded to autoregressive.inference_speech().
+        The remaining parameters (num_autoregressive_samples, max_mel_tokens, cvvp_amount) are accepted for
+        signature compatibility and are not used by this method.
+        :return: The value returned by hifi_decoder.inference() for the generated latents.
+        """
+        # Called for its seeding side effect; the returned seed is not needed here.
+        self.deterministic_state(seed=use_deterministic_seed)
+
+        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
+        text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
+        assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
+        if voice_samples is not None:
+            auto_conditioning = self.get_conditioning_latents(voice_samples, return_mels=False)
+        else:
+            auto_conditioning = self.get_random_conditioning_latents()
+        auto_conditioning = auto_conditioning.to(self.device)
+
+        with torch.no_grad():
+            if verbose:
+                print("Generating autoregressive samples..")
+            with torch.autocast(
+                    device_type="cuda", dtype=torch.float16, enabled=self.half
+                ):
+                codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                            top_k=50,
+                                                            top_p=top_p,
+                                                            temperature=temperature,
+                                                            do_sample=True,
+                                                            num_beams=1,
+                                                            num_return_sequences=1,
+                                                            length_penalty=float(length_penalty),
+                                                            repetition_penalty=float(repetition_penalty),
+                                                            output_attentions=False,
+                                                            output_hidden_states=True,
+                                                            **hf_generate_kwargs)
+                # Second pass over the sampled codes to obtain the latents the decoder consumes.
+                gpt_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
+                                torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                return_latent=True, clip_inputs=False)
+            if verbose:
+                print("generating audio..")
+            wav_gen = self.hifi_decoder.inference(gpt_latents.to(self.device), auto_conditioning)
+            return wav_gen
+
+    def deterministic_state(self, seed=None):
+        """
+        Seeds torch and the `random` module and returns the seed so results can be reproduced.
+        :param seed: Seed to use; when None, int(time()) is used.
+        :return: The seed that was applied.
+        """
+        seed = int(time()) if seed is None else seed
+        torch.manual_seed(seed)
+        random.seed(seed)
+        # Can't currently set this because of CUBLAS. TODO: potentially enable it if necessary.
+        # torch.use_deterministic_algorithms(True)
+
+        return seed
--- a/tortoise/models/autoregressive.py
+++ b/tortoise/models/autoregressive.py
@ -38,6 +38,7 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
        self.transformer = gpt
        self.text_pos_embedding = text_pos_emb
        self.embeddings = embeddings
+        self.final_norm = norm
        self.lm_head = nn.Sequential(norm, linear)
        self.kv_cache = kv_cache
        
@ -509,7 +510,28 @@ class UnifiedVoice(nn.Module):
        loss_text = F.cross_entropy(text_logits, text_targets.long())
        loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
        return loss_text.mean(), loss_mel.mean(), mel_logits
-
+    def compute_embeddings(
+        self,
+        cond_latents,
+        text_inputs,
+    ):
+        """
+        Builds the prompt embedding for generation, hands it to the inference model and returns
+        placeholder input ids for it.
+
+        The text tokens are wrapped with the start/stop text tokens, embedded (token + positional
+        embedding) and prefixed along dim 1 with `cond_latents`. The result is stored through
+        `self.inference_model.store_mel_emb`.
+
+        Returns a long tensor with one row per prompt and one column per prompt position plus one;
+        every entry is 1 except the last column, which holds `self.start_mel_token`.
+        """
+        # Stop token goes on the right first, then the start token on the left.
+        wrapped_text = F.pad(text_inputs, (0, 1), value=self.stop_text_token)
+        wrapped_text = F.pad(wrapped_text, (1, 0), value=self.start_text_token)
+        text_emb = self.text_embedding(wrapped_text) + self.text_pos_embedding(wrapped_text)
+        prompt_emb = torch.cat([cond_latents.unsqueeze(1), text_emb], dim=1)
+        self.inference_model.store_mel_emb(prompt_emb)
+        batch_size, prompt_len = prompt_emb.shape[0], prompt_emb.shape[1]
+        # +1 column for the start_mel_token that closes the prompt.
+        gpt_inputs = torch.full(
+            (batch_size, prompt_len + 1),
+            fill_value=1,
+            dtype=torch.long,
+            device=wrapped_text.device,
+        )
+        gpt_inputs[:, -1] = self.start_mel_token
+        return gpt_inputs
    def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
                         max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):        

@ -540,7 +562,16 @@ class UnifiedVoice(nn.Module):
                                            num_return_sequences=num_return_sequences, **hf_generate_kwargs)
        return gen[:, trunc_index:]

-
+    def get_generator(self, fake_inputs, **hf_generate_kwargs):
+        """
+        Returns whatever `self.inference_model.generate_stream` produces for `fake_inputs`.
+
+        The mel start token is passed as BOS and the mel stop token as both PAD and EOS; generation is
+        capped at max_length=500 with do_stream=True. Extra keyword arguments are forwarded unchanged.
+        """
+        stop_token = self.stop_mel_token
+        stream_fn = self.inference_model.generate_stream
+        return stream_fn(
+            fake_inputs,
+            bos_token_id=self.start_mel_token,
+            pad_token_id=stop_token,
+            eos_token_id=stop_token,
+            max_length=500,
+            do_stream=True,
+            **hf_generate_kwargs,
+        )
 if __name__ == '__main__':
    gpt = UnifiedVoice(model_dim=256, heads=4, train_solo_embeddings=True, use_mel_codes_as_input=True, max_conditioning_inputs=4)
    l = gpt(torch.randn(2, 3, 80, 800),
--- a/tortoise/models/hifigan_decoder.py
+++ b/tortoise/models/hifigan_decoder.py
@ -0,0 +1,299 @@
+# adapted from https://github.com/jik876/hifi-gan/blob/master/models.py
+import torch
+from torch import nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn import functional as F
+from torch.nn.utils import remove_weight_norm, weight_norm
+
+LRELU_SLOPE = 0.1
+
+
+def get_padding(k, d):
+    """Return the padding for kernel size `k` and dilation `d`: half of (k * d - d), truncated to int."""
+    dilated_span = k * d - d
+    return int(dilated_span / 2)
+
+
+class ResBlock1(torch.nn.Module):
+    """Residual Block Type 1. Three (dilated conv, plain conv) pairs, each wrapped in a skip connection.
+
+    Network (one pair; the block chains three of them)::
+
+        x -> lrelu -> dilated conv -> lrelu -> conv (dilation 1) -> + -> o
+        |-----------------------------------------------------------|
+
+
+    Args:
+        channels (int): number of hidden channels for the convolutional layers.
+        kernel_size (int): size of the convolution filter in each layer.
+        dilation (tuple): dilation of the first conv in each pair; entries 0, 1 and 2 are read.
+    """
+
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super().__init__()
+        first_rate, second_rate, third_rate = dilation[0], dilation[1], dilation[2]
+
+        def build_conv(rate):
+            # Stride-1, weight-normalised conv whose padding comes from get_padding.
+            conv = Conv1d(
+                channels,
+                channels,
+                kernel_size,
+                1,
+                dilation=rate,
+                padding=get_padding(kernel_size, rate),
+            )
+            return weight_norm(conv)
+
+        # Layers are created in the same order as they are registered: convs1[0..2], then convs2[0..2].
+        self.convs1 = nn.ModuleList([build_conv(rate) for rate in (first_rate, second_rate, third_rate)])
+        self.convs2 = nn.ModuleList([build_conv(1) for _ in range(3)])
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): input tensor.
+        Returns:
+            Tensor: output tensor.
+        Shapes:
+            x: [B, C, T]
+        """
+        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
+            branch = dilated_conv(F.leaky_relu(x, LRELU_SLOPE))
+            branch = plain_conv(F.leaky_relu(branch, LRELU_SLOPE))
+            x = branch + x
+        return x
+
+    def remove_weight_norm(self):
+        """Strip the weight-norm reparametrisation from every conv in the block."""
+        for layer in (*self.convs1, *self.convs2):
+            remove_weight_norm(layer)
+
+
+class ResBlock2(torch.nn.Module):
+    """Residual Block Type 2. Two dilated convs, each wrapped in its own skip connection.
+
+    Network (one layer; the block chains two of them)::
+
+        x -> lrelu -> dilated conv -> + -> o
+        |-----------------------------|
+
+
+    Args:
+        channels (int): number of hidden channels for the convolutional layers.
+        kernel_size (int): size of the convolution filter in each layer.
+        dilation (tuple): dilation value for each conv layer; entries 0 and 1 are read.
+    """
+
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super().__init__()
+        first_rate, second_rate = dilation[0], dilation[1]
+
+        def build_conv(rate):
+            # Stride-1, weight-normalised conv whose padding comes from get_padding.
+            conv = Conv1d(
+                channels,
+                channels,
+                kernel_size,
+                1,
+                dilation=rate,
+                padding=get_padding(kernel_size, rate),
+            )
+            return weight_norm(conv)
+
+        self.convs = nn.ModuleList([build_conv(rate) for rate in (first_rate, second_rate)])
+
+    def forward(self, x):
+        """Apply each (lrelu -> conv) layer in turn, adding its output back onto its input."""
+        for conv in self.convs:
+            branch = conv(F.leaky_relu(x, LRELU_SLOPE))
+            x = branch + x
+        return x
+
+    def remove_weight_norm(self):
+        """Strip the weight-norm reparametrisation from every conv in the block."""
+        for layer in self.convs:
+            remove_weight_norm(layer)
+
+
+class HifiganGenerator(torch.nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        resblock_type,
+        resblock_dilation_sizes,
+        resblock_kernel_sizes,
+        upsample_kernel_sizes,
+        upsample_initial_channel,
+        upsample_factors,
+        inference_padding=5,
+        cond_channels=0,
+        conv_pre_weight_norm=True,
+        conv_post_weight_norm=True,
+        conv_post_bias=True,
+    ):
+        r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF)
+
+        Network:
+            x -> lrelu -> upsampling_layer -> resblock1_k1x1 -> z1 -> + -> z_sum / #resblocks -> lrelu -> conv_post_7x1 -> tanh -> o
+                                                 ..          -> zI ---|
+                                              resblockN_kNx1 -> zN ---'
+
+        Args:
+            in_channels (int): number of input tensor channels.
+            out_channels (int): number of output tensor channels.
+            resblock_type (str): type of the `ResBlock`. '1' selects `ResBlock1`; anything else `ResBlock2`.
+            resblock_dilation_sizes (List[List[int]]): list of dilation values in each layer of a `ResBlock`.
+            resblock_kernel_sizes (List[int]): list of kernel sizes for each `ResBlock`.
+            upsample_kernel_sizes (List[int]): list of kernel sizes for each transposed convolution.
+            upsample_initial_channel (int): number of channels for the first upsampling layer. This is divided by 2
+                for each consecutive upsampling layer.
+            upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer.
+            inference_padding (int): stored on the module; not applied by `inference` at the moment. Defaults to 5.
+            cond_channels (int): channels of the global conditioning input. When > 0 a 1x1 `cond_layer` is
+                created and `forward` requires `g`. Defaults to 0.
+            conv_pre_weight_norm (bool): keep weight norm on `conv_pre`. Defaults to True.
+            conv_post_weight_norm (bool): keep weight norm on `conv_post`. Defaults to True.
+            conv_post_bias (bool): whether `conv_post` has a bias term. Defaults to True.
+        """
+        super().__init__()
+        self.inference_padding = inference_padding
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_factors)
+        # initial upsampling layers
+        self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3))
+        resblock = ResBlock1 if resblock_type == "1" else ResBlock2
+        # upsampling layers
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_factors, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+        # MRF blocks: `num_kernels` residual blocks per upsampling stage, stored flat.
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
+                self.resblocks.append(resblock(ch, k, d))
+        # post convolution layer (`ch` is the channel count of the last upsampling stage)
+        self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3, bias=conv_post_bias))
+        if cond_channels > 0:
+            self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1)
+
+        if not conv_pre_weight_norm:
+            remove_weight_norm(self.conv_pre)
+
+        if not conv_post_weight_norm:
+            remove_weight_norm(self.conv_post)
+
+    def forward(self, x, g=None):
+        """
+        Args:
+            x (Tensor): feature input tensor.
+            g (Tensor): global conditioning input tensor. Required when the generator was built with
+                `cond_channels > 0`, ignored otherwise.
+
+        Returns:
+            Tensor: output waveform.
+
+        Shapes:
+            x: [B, C, T]
+            Tensor: [B, 1, T]
+        """
+        o = self.conv_pre(x)
+        if hasattr(self, "cond_layer"):
+            o = o + self.cond_layer(g)
+        for i in range(self.num_upsamples):
+            o = F.leaky_relu(o, LRELU_SLOPE)
+            o = self.ups[i](o)
+            # Average the outputs of this stage's residual blocks (MRF).
+            z_sum = None
+            for j in range(self.num_kernels):
+                if z_sum is None:
+                    z_sum = self.resblocks[i * self.num_kernels + j](o)
+                else:
+                    z_sum += self.resblocks[i * self.num_kernels + j](o)
+            o = z_sum / self.num_kernels
+        # NOTE(review): this activation uses the default negative slope, not LRELU_SLOPE. It looks
+        # deliberate (pretrained weights presumably depend on it) - confirm before changing.
+        o = F.leaky_relu(o)
+        o = self.conv_post(o)
+        o = torch.tanh(o)
+        return o
+
+    @torch.no_grad()
+    def inference(self, c, g=None):
+        """
+        Args:
+            c (Tensor): conditioning input tensor. It is transposed on dims 1 and 2 and linearly
+                interpolated along the last axis before being passed to `forward`.
+            g (Tensor): optional global conditioning tensor. A leading axis is added and dims 1 and 2
+                are swapped before it is passed to `forward`.
+
+        Returns:
+            Tensor: output waveform.
+
+        Shapes:
+            c: [B, T, C] (assumed from the transpose feeding `forward`, which expects [B, C, T] - TODO confirm)
+            Tensor: [B, 1, T]
+        """
+        # Run on whatever device the generator lives on instead of assuming "cuda", so CPU/MPS and
+        # non-default CUDA devices work too.
+        device = next(self.parameters()).device
+        # NOTE: `self.inference_padding` is intentionally not applied here (replicate padding was disabled).
+        # Presumably 1024 / 256 converts between hop sizes and 24000 / 22050 between sample rates -
+        # TODO confirm against the checkpoint these constants were chosen for.
+        up_1 = torch.nn.functional.interpolate(
+            c.transpose(1, 2),
+            scale_factor=[1024 / 256],
+            mode="linear",
+        )
+        up_2 = torch.nn.functional.interpolate(
+            up_1,
+            scale_factor=[24000 / 22050],
+            mode="linear",
+        )
+        if g is not None:
+            g = g.unsqueeze(0).transpose(1, 2).to(device)
+        return self.forward(up_2.to(device), g)
+
+    def remove_weight_norm(self):
+        """Remove weight norm from all upsampling layers, residual blocks, `conv_pre` and `conv_post`."""
+        print("Removing weight norm...")
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
--- a/tortoise/models/stream_generator.py
+++ b/tortoise/models/stream_generator.py
--- a/tortoise/read_fast.py
+++ b/tortoise/read_fast.py
@ -0,0 +1,77 @@
+import argparse
+import os
+from time import time
+
+import torch
+import torchaudio
+
+from api_fast import TextToSpeech, MODELS_DIR
+from utils.audio import load_audio, load_voices
+from utils.text import split_and_recombine_text
+
+
+if __name__ == '__main__':
+    def _str2bool(value):
+        """Parse a command-line boolean.
+
+        argparse's `type=bool` calls bool() on the raw string, so '--half False' would be True.
+        """
+        if isinstance(value, bool):
+            return value
+        lowered = value.strip().lower()
+        if lowered in ('true', 't', 'yes', 'y', '1'):
+            return True
+        if lowered in ('false', 'f', 'no', 'n', '0', ''):
+            return False
+        raise argparse.ArgumentTypeError(f'Expected a boolean value, got {value!r}.')
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="tortoise/data/riding_hood.txt")
+    parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
+                                                 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='lj')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
+    parser.add_argument('--output_name', type=str, help='How to name the output file', default='combined.wav')
+    parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None)
+    parser.add_argument('--model_dir', type=str, help='Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this '
+                                                      'should only be specified if you have custom checkpoints.', default=MODELS_DIR)
+    parser.add_argument('--seed', type=int, help='Random seed which can be used to reproduce results.', default=None)
+    parser.add_argument('--use_deepspeed', type=_str2bool, help='Use deepspeed for speed bump.', default=False)
+    parser.add_argument('--kv_cache', type=_str2bool, help='If you disable this please wait for a long a time to get the output', default=True)
+    parser.add_argument('--half', type=_str2bool, help="float16(half) precision inference if True it's faster and take less vram and ram", default=True)
+
+
+    args = parser.parse_args()
+    # Deepspeed is switched off whenever the MPS backend is available.
+    if torch.backends.mps.is_available():
+        args.use_deepspeed = False
+    tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
+
+    outpath = args.output_path
+    outname = args.output_name
+    # The default name already ends in '.wav'; only append the extension when it is missing so the
+    # combined file is not written as 'combined.wav.wav'.
+    combined_name = outname if outname.lower().endswith('.wav') else f"{outname}.wav"
+    selected_voices = args.voice.split(',')
+    regenerate = args.regenerate
+    if regenerate is not None:
+        regenerate = [int(e) for e in regenerate.split(',')]
+
+    # Process text
+    with open(args.textfile, 'r', encoding='utf-8') as f:
+        text = ' '.join(f.readlines())
+    if '|' in text:
+        print("Found the '|' character in your text, which I will use as a cue for where to split it up. If this was not "
+              "your intent, please remove all '|' characters from the input.")
+        texts = text.split('|')
+    else:
+        texts = split_and_recombine_text(text)
+
+    # One seed is shared by every clip of every voice.
+    seed = int(time()) if args.seed is None else args.seed
+    for selected_voice in selected_voices:
+        voice_outpath = os.path.join(outpath, selected_voice)
+        os.makedirs(voice_outpath, exist_ok=True)
+
+        if '&' in selected_voice:
+            voice_sel = selected_voice.split('&')
+        else:
+            voice_sel = [selected_voice]
+
+        # Only the raw voice samples are passed on; the second value returned by load_voices is unused.
+        voice_samples, _ = load_voices(voice_sel)
+        all_parts = []
+        for j, text in enumerate(texts):
+            # When --regenerate is given, clips not listed are reloaded from disk instead of re-synthesised.
+            if regenerate is not None and j not in regenerate:
+                all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000))
+                continue
+            start_time = time()
+            gen = tts.tts(text, voice_samples=voice_samples, use_deterministic_seed=seed)
+            end_time = time()
+            audio_ = gen.squeeze(0).cpu()
+            print("Time taken to generate the audio: ", end_time - start_time, "seconds")
+            # Real-time factor: generation time divided by audio duration at 24 kHz.
+            print("RTF: ", (end_time - start_time) / (audio_.shape[1] / 24000))
+            torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), audio_, 24000)
+            all_parts.append(audio_)
+        full_audio = torch.cat(all_parts, dim=-1)
+        torchaudio.save(os.path.join(voice_outpath, combined_name), full_audio, 24000)