Add support for extracting and feeding conditioning latents directly into the model

- Adds a new script and API endpoints for doing this
- Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost)
- Updates README

This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem...):
1) Does get_conditioning_latents.py work?
2) Can I feed those latents back into the model by creating a new voice?
3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py?
2026-04-09 00:14:01 +00:00 · 2022-05-01 17:25:18 -06:00 · 2022-05-01 17:25:18 -06:00 · 0ffc191408
commit 0ffc191408
parent a8264f5cef
8 changed files with 165 additions and 78 deletions
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@ -91,6 +91,37 @@ def get_voices():
    return voices


+def load_voice(voice):
+    voices = get_voices()
+    paths = voices[voice]
+    if len(paths) == 1 and paths[0].endswith('.pth'):
+        return None, torch.load(paths[0])
+    else:
+        conds = []
+        for cond_path in paths:
+            c = load_audio(cond_path, 22050)
+            conds.append(c)
+        return conds, None
+
+
+def load_voices(voices):
+    latents = []
+    clips = []
+    for voice in voices:
+        latent, clip = load_voice(voice)
+        if latent is None:
+            assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            clips.extend(clip)
+        elif voice is None:
+            assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            latents.append(latent)
+    if len(latents) == 0:
+        return clips
+    else:
+        latents = torch.stack(latents, dim=0)
+        return latents.mean(dim=0)
+
+
 class TacotronSTFT(torch.nn.Module):
    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,