From ec6c7ba25a5dbea958a5179a727fb4b1001808a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anh=20L=C3=AA=20B=E1=BA=A3o?=
Date: Fri, 30 May 2025 04:17:14 +0700
Subject: [PATCH] Refactor code structure for improved readability and
 maintainability

---
 openui_app.py                       | 337 ++++++++++++++++++++++++++++
 requirements-openui.txt             |   1 +
 requirements.txt                    |   1 -
 scripts/tortoise_tts.py             |   3 +-
 tortoise/api.py                     |   3 -
 tortoise/api_fast.py                |  13 +-
 tortoise/models/transformer.py      |   2 -
 tortoise/tts_stream.py              |   1 -
 tortoise/utils/wav2vec_alignment.py |   4 +-
 9 files changed, 341 insertions(+), 24 deletions(-)
 create mode 100644 openui_app.py
 create mode 100644 requirements-openui.txt

diff --git a/openui_app.py b/openui_app.py
new file mode 100644
index 0000000..72b8fe2
--- /dev/null
+++ b/openui_app.py
@@ -0,0 +1,337 @@
+from flask import Flask, request, render_template_string, send_file, jsonify
+import os
+import uuid
+import threading
+
+app = Flask(__name__)
+
+UPLOAD_FOLDER = 'uploads'
+OUTPUT_FOLDER = 'outputs'
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+
+# Global job storage
+jobs = {}
+processes = {}  # Track running processes for stop functionality
+
+HTML = '''
[HTML/JS template body not recovered: the "Tortoise TTS OpenUI" page with the text input and text-file upload, the voice/preset/seed/device fields, the tuning-option table, submit and stop controls, a progress bar, a log view, an audio player, and the script that submits the form and polls /progress/<job_id>]
+'''
+
+def run_tts_with_progress(cmd, env, job_id):
+    import subprocess
+    import re
+    jobs[job_id]['progress'] = 0
+    jobs[job_id]['log'] = ''
+    jobs[job_id]['done'] = False
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env, text=True, bufsize=1)
+    processes[job_id] = process
+    try:
+        for line in process.stdout:
+            jobs[job_id]['log'] += line
+            match = re.search(r"\((\d+) of (\d+)\)", line)
+            if match:
+                current, total_clips = int(match.group(1)), int(match.group(2))
+                progress = int(current / total_clips * 100)
+                jobs[job_id]['progress'] = progress
+        process.wait()
+    finally:
+        jobs[job_id]['progress'] = 100
+        jobs[job_id]['done'] = True
+        processes.pop(job_id, None)
+
+@app.route('/stop/<job_id>', methods=['POST'])
+def stop_job(job_id):
+    proc = processes.get(job_id)
+    if proc and proc.poll() is None:
+        try:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except Exception:
+                proc.kill()
+        except Exception:
+            pass
+    jobs[job_id]['done'] = True
+    return jsonify({'stopped': True})
+
+@app.route('/', methods=['GET', 'POST'])
+def index():
+    if request.method == 'POST':
+        text = request.form.get('text', '')
+        # Handle uploaded text file
+        if 'text_file' in request.files and request.files['text_file']:
+            file = request.files['text_file']
+            if file.filename:
+                file_content = file.read().decode('utf-8')
+                text = file_content
+        if text:
+            filename = f"{uuid.uuid4()}.wav"
+            output_path = os.path.join(OUTPUT_FOLDER, filename)
+            cmd = [
+                'python', 'scripts/tortoise_tts.py',
+                '-o', output_path
+            ]
+            # Add text
+            if text:
+                cmd.append(text)
+            # Optional arguments
+            voice = request.form.get('voice')
+            if voice:
+                cmd.extend(['-v', voice])
+            preset = request.form.get('preset')
+            if preset:
+                cmd.extend(['-p', preset])
+            candidates = request.form.get('candidates')
+            if candidates and str(candidates) != '1':
+                cmd.extend(['--candidates', str(candidates)])
+            seed = request.form.get('seed')
+            if seed:
+                cmd.extend(['--seed', str(seed)])
+            device = request.form.get('device')
+            if device:
+                cmd.extend(['--device', device])
+            voices_dir = request.form.get('voices_dir')
+            if voices_dir:
+                cmd.extend(['-V', voices_dir])
+            text_split = request.form.get('text_split')
+            if text_split:
+                cmd.extend(['--text-split', text_split])
+            # Tuning options
+            if request.form.get('num_autoregressive_samples'):
+                cmd.extend(['--num-autoregressive-samples', str(request.form['num_autoregressive_samples'])])
+            if request.form.get('temperature'):
+                cmd.extend(['--temperature', str(request.form['temperature'])])
+            if request.form.get('length_penalty'):
+                cmd.extend(['--length-penalty', str(request.form['length_penalty'])])
+            if request.form.get('repetition_penalty'):
+                cmd.extend(['--repetition-penalty', str(request.form['repetition_penalty'])])
+            if request.form.get('top_p'):
+                cmd.extend(['--top-p', str(request.form['top_p'])])
+            if request.form.get('max_mel_tokens'):
+                cmd.extend(['--max-mel-tokens', str(request.form['max_mel_tokens'])])
+            if request.form.get('cvvp_amount'):
+                cmd.extend(['--cvvp-amount', str(request.form['cvvp_amount'])])
+            if request.form.get('diffusion_iterations'):
+                cmd.extend(['--diffusion-iterations', str(request.form['diffusion_iterations'])])
+            if request.form.get('cond_free'):
+                cmd.append('--cond-free')
+            if request.form.get('cond_free_k'):
+                cmd.extend(['--cond-free-k', str(request.form['cond_free_k'])])
+            if request.form.get('diffusion_temperature'):
+                cmd.extend(['--diffusion-temperature', str(request.form['diffusion_temperature'])])
+            if request.form.get('quiet'):
+                cmd.append('-q')
+            if request.form.get('produce_debug_state'):
+                cmd.append('--produce-debug-state')
+            if request.form.get('skip_existing'):
+                cmd.append('--skip-existing')
+            if request.form.get('disable_redaction'):
+                cmd.append('--disable-redaction')
+            env = os.environ.copy()
+            env["PYTHONPATH"] = os.path.abspath(os.path.dirname(__file__))
+            job_id = str(uuid.uuid4())
+            jobs[job_id] = {'progress': 0, 'log': '', 'done': False, 'audio_url': f"/audio/{filename}"}
+            t = threading.Thread(target=run_tts_with_progress, args=(cmd, env, job_id))
+            t.start()
+            return jsonify({'job_id': job_id})
+    # GET request
+    return render_template_string(HTML)
+
+@app.route('/progress/<job_id>')
+def progress(job_id):
+    job = jobs.get(job_id)
+    if not job:
+        return jsonify({'progress': 0, 'log': '', 'done': True, 'audio_url': None})
+    resp = {
+        'progress': job['progress'],
+        'log': job['log'],
+        'done': job['done'],
+        'audio_url': job['audio_url'] if job['done'] else None
+    }
+    return jsonify(resp)
+
+@app.route('/audio/<filename>')
+def audio(filename):
+    return send_file(os.path.join(OUTPUT_FOLDER, filename), as_attachment=False)
+
+if __name__ == '__main__':
+    app.run(debug=True, port=5000)
diff --git a/requirements-openui.txt b/requirements-openui.txt
new file mode 100644
index 0000000..e3e9a71
--- /dev/null
+++ b/requirements-openui.txt
@@ -0,0 +1 @@
+Flask
diff --git a/requirements.txt b/requirements.txt
index fd8d538..33437b8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,6 @@ appdirs
 nbconvert==5.3.1
 tornado==4.2
 pydantic==1.9.1
-deepspeed==0.8.3
 py-cpuinfo
 hjson
 psutil
diff --git a/scripts/tortoise_tts.py b/scripts/tortoise_tts.py
index 932a780..89f2082 100755
--- a/scripts/tortoise_tts.py
+++ b/scripts/tortoise_tts.py
@@ -8,9 +8,8 @@ import time
 
 import torch
 import torchaudio
-
 from tortoise.api import MODELS_DIR, TextToSpeech
-from tortoise.utils.audio import get_voices, load_voices, load_audio
+from tortoise.utils.audio import get_voices, load_audio, load_voices
 from tortoise.utils.text import split_and_recombine_text
 
 parser = argparse.ArgumentParser(
diff --git a/tortoise/api.py b/tortoise/api.py
index 8a010c2..b461633 100644
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -1,12 +1,9 @@
 import os
 import random
-import uuid
 from time import time
-from urllib import request
 
 import torch
 import torch.nn.functional as F
-import progressbar
 import torchaudio
 
 from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead
diff --git a/tortoise/api_fast.py b/tortoise/api_fast.py
index fd7c590..d6838c0 100644
--- a/tortoise/api_fast.py
+++ b/tortoise/api_fast.py
@@ -1,29 +1,18 @@
 import os
 import random
-import uuid
 from time import time
-from urllib import request
 
 import torch
 import torch.nn.functional as F
-import progressbar
-import torchaudio
-import numpy as np
 from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead
-from tortoise.models.diffusion_decoder import DiffusionTts
 from tortoise.models.autoregressive import UnifiedVoice
-from tqdm import tqdm
 from tortoise.models.arch_util import TorchMelSpectrogram
-from tortoise.models.clvp import CLVP
-from tortoise.models.cvvp import CVVP
 from tortoise.models.hifigan_decoder import HifiganGenerator
 from tortoise.models.random_latent_generator import RandomLatentConverter
-from tortoise.models.vocoder import UnivNetGenerator
-from tortoise.utils.audio import wav_to_univnet_mel, denormalize_tacotron_mel
+from tortoise.utils.audio import denormalize_tacotron_mel
 from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
 from tortoise.utils.tokenizer import VoiceBpeTokenizer
 from tortoise.utils.wav2vec_alignment import Wav2VecAlignment
-from contextlib import contextmanager
 from tortoise.models.stream_generator import init_stream_support
 from huggingface_hub import hf_hub_download
 pbar = None
diff --git a/tortoise/models/transformer.py b/tortoise/models/transformer.py
index d16cbc9..29f6792 100644
--- a/tortoise/models/transformer.py
+++ b/tortoise/models/transformer.py
@@ -1,9 +1,7 @@
-from functools import partial
 
 import torch
 import torch.nn.functional as F
 from einops import rearrange
-from rotary_embedding_torch import RotaryEmbedding
 from torch import nn
 
 
diff --git a/tortoise/tts_stream.py b/tortoise/tts_stream.py
index 94eaff5..e16307e 100644
--- a/tortoise/tts_stream.py
+++ b/tortoise/tts_stream.py
@@ -3,7 +3,6 @@ import os
 from time import time
 
 import torch
-import torchaudio
 
 from api_fast import TextToSpeech, MODELS_DIR
 from utils.audio import load_audio, load_voices
diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py
index adc39e3..9a58262 100644
--- a/tortoise/utils/wav2vec_alignment.py
+++ b/tortoise/utils/wav2vec_alignment.py
@@ -1,10 +1,8 @@
-import re
 
 import torch
 import torchaudio
-from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2Processor
+from transformers import Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer
 
-from tortoise.utils.audio import load_audio
 
 
 def max_alignment(s1, s2, skip_character='~', record=None):
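
Note on the template: only a summary of the HTML body survives above. As a rough illustration of how a page could drive the routes in openui_app.py, a minimal replacement for the HTML string might look like the sketch below. Every tag, element id, and the polling script are illustrative assumptions, not the original markup, which also exposes the candidates, seed, device, tuning, and stop controls read by index() and stop_job().

# Hypothetical minimal template (not the original): posts the form to "/",
# which starts a background job, then polls /progress/<job_id> until done.
HTML = '''
<!DOCTYPE html>
<html>
<head><title>Tortoise TTS OpenUI</title></head>
<body>
  <h1>Tortoise TTS OpenUI</h1>
  <form id="tts-form">
    <textarea name="text" rows="6" cols="60" placeholder="Text to speak"></textarea><br>
    <input name="voice" placeholder="voice"> <input name="preset" placeholder="preset">
    <button type="submit">Generate</button>
  </form>
  <progress id="bar" value="0" max="100"></progress>
  <pre id="log"></pre>
  <audio id="player" controls></audio>
  <script>
    document.getElementById('tts-form').onsubmit = async (e) => {
      e.preventDefault();
      // index() answers the POST with {"job_id": ...} and runs the CLI in a thread
      const resp = await fetch('/', {method: 'POST', body: new FormData(e.target)});
      const {job_id} = await resp.json();
      const poll = setInterval(async () => {
        const job = await (await fetch('/progress/' + job_id)).json();
        document.getElementById('bar').value = job.progress;
        document.getElementById('log').textContent = job.log;
        if (job.done) {
          clearInterval(poll);
          if (job.audio_url) document.getElementById('player').src = job.audio_url;
        }
      }, 1000);
    };
  </script>
</body>
</html>
'''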