mirror of
https://github.com/neonbjb/tortoise-tts.git
synced 2026-04-09 00:14:01 +00:00
Add support for extracting and feeding conditioning latents directly into the model
- Adds a new script and API endpoints for doing this.
- Reworks the autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost).
- Updates the README.

This is untested. The following manual tests still need to be done (and someday unit tests should be written for this behemoth before it becomes a problem...):
1) Does get_conditioning_latents.py work?
2) Can I feed those latents back into the model by creating a new voice?
3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py?
This commit is contained in:
parent
a8264f5cef
commit
0ffc191408
8 changed files with 165 additions and 78 deletions
|
|
@ -91,6 +91,37 @@ def get_voices():
|
|||
return voices
|
||||
|
||||
|
||||
def load_voice(voice):
    """Load the conditioning data registered under the name ``voice``.

    The voice's file paths are looked up in the mapping returned by
    ``get_voices()``.

    Returns a ``(clips, latents)`` pair in which exactly one entry is set:

    * ``(None, latents)`` when the voice consists of a single ``.pth`` file;
      ``latents`` is whatever ``torch.load`` yields for that file.
    * ``(clips, None)`` otherwise; ``clips`` holds one
      ``load_audio(path, 22050)`` result per path (22050 is presumably the
      target sample rate -- confirm against ``load_audio``).

    Raises ``KeyError`` if ``voice`` is not a key of ``get_voices()``.
    """
    voice_paths = get_voices()[voice]

    is_latent_file = len(voice_paths) == 1 and voice_paths[0].endswith('.pth')
    if is_latent_file:
        # NOTE(review): torch.load unpickles the file; only use .pth voices
        # that come from a trusted source.
        return None, torch.load(voice_paths[0])

    return [load_audio(audio_path, 22050) for audio_path in voice_paths], None
|
||||
|
||||
|
||||
def load_voices(voices):
    """Load several voices and merge them into one conditioning input.

    Each name in ``voices`` is loaded with ``load_voice``, which returns a
    ``(clips, latent)`` pair where exactly one entry is ``None``.

    Returns:
        * a flat list with the audio clips of every voice, when all voices
          are raw-audio voices (an empty list if ``voices`` is empty), or
        * the element-wise mean of the stacked latents, when all voices are
          latent (``.pth``) voices.

    Raises:
        AssertionError: if raw-audio voices and latent voices are mixed.
    """
    latents = []
    clips = []
    for voice in voices:
        # load_voice returns (clips, latent) -- unpack in that order.
        clip, latent = load_voice(voice)
        if latent is None:
            # Raw-audio voice: no latent voice may have been seen before.
            assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
            clips.extend(clip)
        else:
            # Latent voice: no raw-audio clips may have been collected before.
            assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
            latents.append(latent)

    if len(latents) == 0:
        return clips

    # Blend the latent voices by averaging them along a new leading axis.
    return torch.stack(latents, dim=0).mean(dim=0)
|
||||
|
||||
|
||||
class TacotronSTFT(torch.nn.Module):
|
||||
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
|
||||
n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue