This commit is contained in:
Bryan Bonvallet 2024-11-25 21:39:19 +00:00 committed by GitHub
commit d2c707c095
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 104 additions and 1 deletions

View file

@ -397,6 +397,7 @@ class TextToSpeech:
auto_conditioning, diffusion_conditioning = conditioning_latents
else:
auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
debug_conditioning = (auto_conditioning, diffusion_conditioning)
auto_conditioning = auto_conditioning.to(self.device)
diffusion_conditioning = diffusion_conditioning.to(self.device)
@ -592,7 +593,7 @@ class TextToSpeech:
res = wav_candidates[0]
if return_deterministic_state:
return res, (deterministic_seed, text, voice_samples, conditioning_latents)
return res, (deterministic_seed, text, voice_samples, conditioning_latents, debug_conditioning)
else:
return res
def deterministic_state(self, seed=None):

102
tortoise/gradiowrapper.py Normal file
View file

@ -0,0 +1,102 @@
import torch
import torchaudio
import datetime
import tempfile
import gradio as gr
#import ffmpegio
# import numpy as np
from api import TextToSpeech, MODELS_DIR
from utils.audio import load_voices
tts = TextToSpeech(models_dir=MODELS_DIR, use_deepspeed=False, kv_cache=True, half=True)
title = "TortoiseTTS UI"
description = "TUDDLE over Gradio"
article = "<p style='text-align: center'><a href='https://github.com/neonbjb/tortoise-tts' target='_blank' class='footer'>Github Repo</a></p>"
examples = [
]
# where is this coded in the tts generative model code?
sample_rate = 24000
def inference(speakers, text, seed, diterations):
#get_debug_info = True if speakers == 'random' else False
get_debug_info = True
if ',' in speakers:
voice_sel = speakers.split(',')
else:
voice_sel = [speakers]
voice_samples, conditioning_latents = load_voices(voice_sel)
if seed < 0:
seed = None
start = datetime.datetime.now()
# k is how many samples to run
# cvvp amount above 0 if you need to reduce multiple speakers
# max_mel_tokens is the max number of 1/20 second length tokens used by something under the hood and impacts output duration. 500 ~= 25 seconds
retval = tts.tts(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
num_autoregressive_samples=96, diffusion_iterations=int(diterations), max_mel_tokens=500,
use_deterministic_seed=seed, cvvp_amount=0.0, return_deterministic_state=get_debug_info)
debug_info = None
conditioning_latents = None
if get_debug_info:
gen, debug_info = retval
conditioning_latents = debug_info[4]
with tempfile.NamedTemporaryFile(suffix=".pth", delete=False) as fp:
torch.save(conditioning_latents, fp.name)
debug_info = fp.name
else:
gen = retval
if isinstance(gen, list):
raise gr.Error("Keep k=1 to generate a single audio file.")
audio_array = gen.squeeze(0).cpu()
#with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as fp:
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
torchaudio.save(fp.name, audio_array, sample_rate)
print(fp.name, debug_info)
print("duration", datetime.datetime.now() - start)
return (fp.name, debug_info)
#with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as mp3fp:
# ffmpegio.transcode(fp.name, mp3fp.name, overwrite=True)
# return mp3fp.name
gr.Interface(
fn=inference,
inputs=[
gr.components.Textbox(
label="Speaker",
value="random"
),
gr.components.Textbox(
label="Text",
value="Hello, my dog is cute",
),
gr.components.Number(
label="Seed",
value=-1,
),
gr.components.Number(
label="DIterations",
value=80,
minimum=30,
maximum=400,
),
],
outputs=[
gr.components.Audio(label="Speech", type="filepath"),
gr.components.File(label="Latent from Random", type="file"),
],
title=title,
description=description,
article=article,
examples=examples,
allow_flagging='never',
).launch(debug=False, enable_queue=True, server_name="0.0.0.0")