diff --git a/openui_app.py b/openui_app.py
new file mode 100644
index 0000000..72b8fe2
--- /dev/null
+++ b/openui_app.py
@@ -0,0 +1,337 @@
+from flask import Flask, request, render_template_string, send_file, jsonify
+import os
+import uuid
+import threading
+
+app = Flask(__name__)
+
+UPLOAD_FOLDER = 'uploads'
+OUTPUT_FOLDER = 'outputs'
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+
+# Global job storage
+jobs = {}
+processes = {} # Track running processes for stop functionality
+
+HTML = '''
+<!DOCTYPE html>
+<html>
+<head><title>Tortoise TTS OpenUI</title></head>
+<body>
+<h1>Tortoise TTS OpenUI</h1>
+<!-- Minimal single-page UI: submit the form, poll /progress/<job_id>, play the result. -->
+<form id="tts-form">
+  <textarea name="text" rows="6" cols="80" placeholder="Text to speak"></textarea><br>
+  <input type="file" name="text_file"> <input type="text" name="voice" placeholder="voice"> <input type="text" name="preset" placeholder="preset"><br>
+  <button type="submit">Generate</button> <button type="button" id="stop">Stop</button>
+</form>
+<progress id="bar" value="0" max="100"></progress>
+<pre id="log"></pre>
+<audio id="player" controls></audio>
+<script>
+let jobId = null;
+document.getElementById('tts-form').onsubmit = async (e) => {
+  e.preventDefault();
+  const resp = await fetch('/', {method: 'POST', body: new FormData(e.target)});
+  jobId = (await resp.json()).job_id;
+  const timer = setInterval(async () => {
+    const job = await (await fetch('/progress/' + jobId)).json();
+    document.getElementById('bar').value = job.progress;
+    document.getElementById('log').textContent = job.log;
+    if (job.done) {
+      clearInterval(timer);
+      if (job.audio_url) document.getElementById('player').src = job.audio_url;
+    }
+  }, 1000);
+};
+document.getElementById('stop').onclick = () => { if (jobId) fetch('/stop/' + jobId, {method: 'POST'}); };
+</script>
+</body>
+</html>
+'''
+
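+# Background worker: stream the CLI's combined stdout/stderr into the job log and
+# estimate progress from "(N of M)" clip-counter lines (format assumed from the
+# per-clip output of scripts/tortoise_tts.py).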
+def run_tts_with_progress(cmd, env, job_id):
+    import subprocess
+    import re
+    jobs[job_id]['progress'] = 0
+    jobs[job_id]['log'] = ''
+    jobs[job_id]['done'] = False
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env, text=True, bufsize=1)
+    processes[job_id] = process
+    try:
+        for line in process.stdout:
+            jobs[job_id]['log'] += line
+            match = re.search(r"\((\d+) of (\d+)\)", line)
+            if match:
+                current, total_clips = int(match.group(1)), int(match.group(2))
+                progress = int(current / total_clips * 100)
+                jobs[job_id]['progress'] = progress
+        process.wait()
+    finally:
+        jobs[job_id]['progress'] = 100
+        jobs[job_id]['done'] = True
+        processes.pop(job_id, None)
+
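+# Stop endpoint: terminate the running CLI process for a job, escalating to kill
+# if it does not exit within a few seconds.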
+@app.route('/stop/<job_id>', methods=['POST'])
+def stop_job(job_id):
+    proc = processes.get(job_id)
+    if proc and proc.poll() is None:
+        try:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except Exception:
+                proc.kill()
+        except Exception:
+            pass
+    if job_id in jobs:
+        jobs[job_id]['done'] = True
+    return jsonify({'stopped': True})
+
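+# Form handler: each submitted field maps onto a flag of scripts/tortoise_tts.py;
+# generation runs in a background thread and the client receives a job_id to poll.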
+@app.route('/', methods=['GET', 'POST'])
+def index():
+    if request.method == 'POST':
+        text = request.form.get('text', '')
+        # Handle uploaded text file
+        if 'text_file' in request.files and request.files['text_file']:
+            file = request.files['text_file']
+            if file.filename:
+                file_content = file.read().decode('utf-8')
+                text = file_content
+        if text:
+            filename = f"{uuid.uuid4()}.wav"
+            output_path = os.path.join(OUTPUT_FOLDER, filename)
+            cmd = [
+                'python', 'scripts/tortoise_tts.py',
+                '-o', output_path
+            ]
+            # Add text
+            if text:
+                cmd.append(text)
+            # Optional arguments
+            voice = request.form.get('voice')
+            if voice:
+                cmd.extend(['-v', voice])
+            preset = request.form.get('preset')
+            if preset:
+                cmd.extend(['-p', preset])
+            candidates = request.form.get('candidates')
+            if candidates and str(candidates) != '1':
+                cmd.extend(['--candidates', str(candidates)])
+            seed = request.form.get('seed')
+            if seed:
+                cmd.extend(['--seed', str(seed)])
+            device = request.form.get('device')
+            if device:
+                cmd.extend(['--device', device])
+            voices_dir = request.form.get('voices_dir')
+            if voices_dir:
+                cmd.extend(['-V', voices_dir])
+            text_split = request.form.get('text_split')
+            if text_split:
+                cmd.extend(['--text-split', text_split])
+            # Tuning options
+            if request.form.get('num_autoregressive_samples'):
+                cmd.extend(['--num-autoregressive-samples', str(request.form['num_autoregressive_samples'])])
+            if request.form.get('temperature'):
+                cmd.extend(['--temperature', str(request.form['temperature'])])
+            if request.form.get('length_penalty'):
+                cmd.extend(['--length-penalty', str(request.form['length_penalty'])])
+            if request.form.get('repetition_penalty'):
+                cmd.extend(['--repetition-penalty', str(request.form['repetition_penalty'])])
+            if request.form.get('top_p'):
+                cmd.extend(['--top-p', str(request.form['top_p'])])
+            if request.form.get('max_mel_tokens'):
+                cmd.extend(['--max-mel-tokens', str(request.form['max_mel_tokens'])])
+            if request.form.get('cvvp_amount'):
+                cmd.extend(['--cvvp-amount', str(request.form['cvvp_amount'])])
+            if request.form.get('diffusion_iterations'):
+                cmd.extend(['--diffusion-iterations', str(request.form['diffusion_iterations'])])
+            if request.form.get('cond_free'):
+                cmd.append('--cond-free')
+            if request.form.get('cond_free_k'):
+                cmd.extend(['--cond-free-k', str(request.form['cond_free_k'])])
+            if request.form.get('diffusion_temperature'):
+                cmd.extend(['--diffusion-temperature', str(request.form['diffusion_temperature'])])
+            if request.form.get('quiet'):
+                cmd.append('-q')
+            if request.form.get('produce_debug_state'):
+                cmd.append('--produce-debug-state')
+            if request.form.get('skip_existing'):
+                cmd.append('--skip-existing')
+            if request.form.get('disable_redaction'):
+                cmd.append('--disable-redaction')
+            env = os.environ.copy()
+            env["PYTHONPATH"] = os.path.abspath(os.path.dirname(__file__))
+            job_id = str(uuid.uuid4())
+            jobs[job_id] = {'progress': 0, 'log': '', 'done': False, 'audio_url': f"/audio/{filename}"}
+            t = threading.Thread(target=run_tts_with_progress, args=(cmd, env, job_id))
+            t.start()
+            return jsonify({'job_id': job_id})
+    # GET request
+    return render_template_string(HTML)
+
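+# Progress endpoint polled by the page; audio_url is only exposed once the job is done.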
+@app.route('/progress/<job_id>')
+def progress(job_id):
+    job = jobs.get(job_id)
+    if not job:
+        return jsonify({'progress': 0, 'log': '', 'done': True, 'audio_url': None})
+    resp = {
+        'progress': job['progress'],
+        'log': job['log'],
+        'done': job['done'],
+        'audio_url': job['audio_url'] if job['done'] else None
+    }
+    return jsonify(resp)
+
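+# Serve a finished clip from the outputs folder for the page's audio player.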
+@app.route('/audio/<filename>')
+def audio(filename):
+    return send_file(os.path.join(OUTPUT_FOLDER, filename), as_attachment=False)
+
+if __name__ == '__main__':
+    app.run(debug=True, port=5000)
diff --git a/requirements-openui.txt b/requirements-openui.txt
new file mode 100644
index 0000000..e3e9a71
--- /dev/null
+++ b/requirements-openui.txt
@@ -0,0 +1 @@
+Flask
diff --git a/requirements.txt b/requirements.txt
index fd8d538..33437b8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,6 @@ appdirs
nbconvert==5.3.1
tornado==4.2
pydantic==1.9.1
-deepspeed==0.8.3
py-cpuinfo
hjson
psutil
diff --git a/scripts/tortoise_tts.py b/scripts/tortoise_tts.py
index 932a780..89f2082 100755
--- a/scripts/tortoise_tts.py
+++ b/scripts/tortoise_tts.py
@@ -8,9 +8,8 @@ import time
import torch
import torchaudio
-
from tortoise.api import MODELS_DIR, TextToSpeech
-from tortoise.utils.audio import get_voices, load_voices, load_audio
+from tortoise.utils.audio import get_voices, load_audio, load_voices
from tortoise.utils.text import split_and_recombine_text
parser = argparse.ArgumentParser(
diff --git a/tortoise/api.py b/tortoise/api.py
index 8a010c2..b461633 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -1,12 +1,9 @@
import os
import random
-import uuid
from time import time
-from urllib import request
import torch
import torch.nn.functional as F
-import progressbar
import torchaudio
from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead
diff --git a/tortoise/api_fast.py b/tortoise/api_fast.py
index fd7c590..d6838c0 100644
--- a/tortoise/api_fast.py
+++ b/tortoise/api_fast.py
@@ -1,29 +1,18 @@
import os
import random
-import uuid
from time import time
-from urllib import request
import torch
import torch.nn.functional as F
-import progressbar
-import torchaudio
-import numpy as np
from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead
-from tortoise.models.diffusion_decoder import DiffusionTts
from tortoise.models.autoregressive import UnifiedVoice
-from tqdm import tqdm
from tortoise.models.arch_util import TorchMelSpectrogram
-from tortoise.models.clvp import CLVP
-from tortoise.models.cvvp import CVVP
from tortoise.models.hifigan_decoder import HifiganGenerator
from tortoise.models.random_latent_generator import RandomLatentConverter
-from tortoise.models.vocoder import UnivNetGenerator
-from tortoise.utils.audio import wav_to_univnet_mel, denormalize_tacotron_mel
+from tortoise.utils.audio import denormalize_tacotron_mel
from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
from tortoise.utils.tokenizer import VoiceBpeTokenizer
from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
-from contextlib import contextmanager
from tortoise.models.stream_generator import init_stream_support
from huggingface_hub import hf_hub_download
pbar = None
diff --git a/tortoise/models/transformer.py b/tortoise/models/transformer.py
index d16cbc9..29f6792 100644
--- a/tortoise/models/transformer.py
+++ b/tortoise/models/transformer.py
@@ -1,9 +1,7 @@
-from functools import partial
import torch
import torch.nn.functional as F
from einops import rearrange
-from rotary_embedding_torch import RotaryEmbedding
from torch import nn
diff --git a/tortoise/tts_stream.py b/tortoise/tts_stream.py
index 94eaff5..e16307e 100644
--- a/tortoise/tts_stream.py
+++ b/tortoise/tts_stream.py
@@ -3,7 +3,6 @@ import os
from time import time
import torch
-import torchaudio
from api_fast import TextToSpeech, MODELS_DIR
from utils.audio import load_audio, load_voices
diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py
index adc39e3..9a58262 100644
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@@ -1,10 +1,8 @@
-import re
import torch
import torchaudio
-from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
+from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer
-from tortoise.utils.audio import load_audio
def max_alignment(s1, s2, skip_character='~', record=None):