import argparse
import os
from time import time

import torch
import torchaudio

from api import TextToSpeech, MODELS_DIR
from utils.audio import load_audio, load_voices
from utils.text import split_and_recombine_text


if __name__ == ' __main__ ' :
parser = argparse . ArgumentParser ( )
2022-05-03 04:56:28 +02:00
parser . add_argument ( ' --textfile ' , type = str , help = ' A file containing the text to read. ' , default = " tortoise/data/riding_hood.txt " )
2022-04-15 16:26:11 +02:00
parser . add_argument ( ' --voice ' , type = str , help = ' Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
2022-04-26 04:07:07 +02:00
' Use the & character to join two voices together. Use a comma to perform inference on multiple voices. ' , default = ' pat ' )
2022-05-03 05:37:39 +02:00
parser . add_argument ( ' --output_path ' , type = str , help = ' Where to store outputs. ' , default = ' results/longform/ ' )
2023-04-17 05:52:03 +02:00
parser . add_argument ( ' --output_name ' , type = str , help = ' How to name the output file ' , default = ' combined.wav ' )
2022-04-21 01:24:09 +02:00
parser . add_argument ( ' --preset ' , type = str , help = ' Which voice preset to use. ' , default = ' standard ' )
2022-04-26 04:05:21 +02:00
parser . add_argument ( ' --regenerate ' , type = str , help = ' Comma-separated list of clip numbers to re-generate, or nothing. ' , default = None )
2022-05-22 13:26:01 +02:00
parser . add_argument ( ' --candidates ' , type = int , help = ' How many output candidates to produce per-voice. Only the first candidate is actually used in the final product, the others can be used manually. ' , default = 1 )
2022-05-02 01:29:25 +02:00
parser . add_argument ( ' --model_dir ' , type = str , help = ' Where to find pretrained model checkpoints. Tortoise automatically downloads these to .models, so this '
2022-05-19 18:37:57 +02:00
' should only be specified if you have custom checkpoints. ' , default = MODELS_DIR )
2022-05-17 20:11:18 +02:00
parser . add_argument ( ' --seed ' , type = int , help = ' Random seed which can be used to reproduce results. ' , default = None )
parser . add_argument ( ' --produce_debug_state ' , type = bool , help = ' Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true. ' , default = True )
2023-07-29 02:56:03 +02:00
parser . add_argument ( ' --use_deepspeed ' , type = bool , help = ' Use deepspeed for speed bump. ' , default = True )
parser . add_argument ( ' --kv_cache ' , type = bool , help = ' If you disable this please wait for a long a time to get the output ' , default = True )
2023-07-30 09:32:50 +02:00
parser . add_argument ( ' --half ' , type = bool , help = " float16(half) precision inference if True it ' s faster and take less vram and ram " , default = True )
2023-07-29 02:56:03 +02:00
2022-05-17 20:11:18 +02:00
2022-04-11 03:29:42 +02:00
args = parser . parse_args ( )
2023-07-29 02:56:03 +02:00
tts = TextToSpeech ( models_dir = args . model_dir , use_deepspeed = args . use_deepspeed , kv_cache = args . kv_cache , half = args . half )
2022-04-11 03:29:42 +02:00
2022-04-15 16:26:11 +02:00
outpath = args . output_path
2023-04-17 05:52:03 +02:00
outname = args . output_name
2022-04-15 16:26:11 +02:00
selected_voices = args . voice . split ( ' , ' )
2022-04-26 04:05:21 +02:00
regenerate = args . regenerate
if regenerate is not None :
regenerate = [ int ( e ) for e in regenerate . split ( ' , ' ) ]
2022-05-02 01:29:25 +02:00
2022-05-12 19:24:55 +02:00
# Process text
with open ( args . textfile , ' r ' , encoding = ' utf-8 ' ) as f :
text = ' ' . join ( [ l for l in f . readlines ( ) ] )
if ' | ' in text :
print ( " Found the ' | ' character in your text, which I will use as a cue for where to split it up. If this was not "
" your intent, please remove all ' | ' characters from the input. " )
texts = text . split ( ' | ' )
else :
texts = split_and_recombine_text ( text )
2022-05-17 20:11:18 +02:00
seed = int ( time ( ) ) if args . seed is None else args . seed
2022-04-15 16:26:11 +02:00
for selected_voice in selected_voices :
voice_outpath = os . path . join ( outpath , selected_voice )
os . makedirs ( voice_outpath , exist_ok = True )
if ' & ' in selected_voice :
voice_sel = selected_voice . split ( ' & ' )
else :
voice_sel = [ selected_voice ]
2022-04-11 03:29:42 +02:00
2022-05-02 01:25:18 +02:00
voice_samples , conditioning_latents = load_voices ( voice_sel )
2022-04-21 23:19:36 +02:00
all_parts = [ ]
2022-04-15 16:26:11 +02:00
for j , text in enumerate ( texts ) :
2022-04-26 04:05:21 +02:00
if regenerate is not None and j not in regenerate :
all_parts . append ( load_audio ( os . path . join ( voice_outpath , f ' { j } .wav ' ) , 24000 ) )
continue
2022-05-02 01:25:18 +02:00
gen = tts . tts_with_preset ( text , voice_samples = voice_samples , conditioning_latents = conditioning_latents ,
2022-05-22 13:26:01 +02:00
preset = args . preset , k = args . candidates , use_deterministic_seed = seed )
if args . candidates == 1 :
2023-07-29 02:21:56 +02:00
audio_ = gen . squeeze ( 0 ) . cpu ( )
torchaudio . save ( os . path . join ( voice_outpath , f ' { j } .wav ' ) , audio_ , 24000 )
2022-05-22 13:26:01 +02:00
else :
candidate_dir = os . path . join ( voice_outpath , str ( j ) )
os . makedirs ( candidate_dir , exist_ok = True )
for k , g in enumerate ( gen ) :
torchaudio . save ( os . path . join ( candidate_dir , f ' { k } .wav ' ) , g . squeeze ( 0 ) . cpu ( ) , 24000 )
2023-07-29 02:21:56 +02:00
audio_ = gen [ 0 ] . squeeze ( 0 ) . cpu ( )
all_parts . append ( audio_ )
2022-05-17 20:11:18 +02:00
2022-06-05 00:47:29 +02:00
if args . candidates == 1 :
full_audio = torch . cat ( all_parts , dim = - 1 )
2023-04-17 05:52:03 +02:00
torchaudio . save ( os . path . join ( voice_outpath , f " { outname } .wav " ) , full_audio , 24000 )
2022-04-11 03:29:42 +02:00
2022-05-17 20:11:18 +02:00
if args . produce_debug_state :
os . makedirs ( ' debug_states ' , exist_ok = True )
dbg_state = ( seed , texts , voice_samples , conditioning_latents )
torch . save ( dbg_state , f ' debug_states/read_debug_ { selected_voice } .pth ' )
2022-06-06 22:13:29 +02:00
# Combine each candidate's audio clips.
if args . candidates > 1 :
audio_clips = [ ]
for candidate in range ( args . candidates ) :
for line in range ( len ( texts ) ) :
wav_file = os . path . join ( voice_outpath , str ( line ) , f " { candidate } .wav " )
audio_clips . append ( load_audio ( wav_file , 24000 ) )
audio_clips = torch . cat ( audio_clips , dim = - 1 )
2023-04-17 05:52:03 +02:00
torchaudio . save ( os . path . join ( voice_outpath , f " { outname } _ { candidate : 02d } .wav " ) , audio_clips , 24000 )
2022-06-06 22:13:29 +02:00
audio_clips = [ ]