mirror of
https://github.com/neonbjb/tortoise-tts.git
synced 2026-03-17 18:54:39 +01:00
add tokenizer
This commit is contained in:
parent
4003544b6f
commit
72eccabcb7
2
setup.py
2
setup.py
|
|
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|||
setuptools.setup(
|
||||
name="TorToiSe",
|
||||
packages=setuptools.find_packages(),
|
||||
version="2.7.0",
|
||||
version="2.8.0",
|
||||
author="James Betker",
|
||||
author_email="james@adamant.ai",
|
||||
description="A high quality multi-voice text-to-speech library",
|
||||
|
|
|
|||
|
|
@@ -207,7 +207,9 @@ class TextToSpeech:
|
|||
"""
|
||||
|
||||
def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR,
|
||||
enable_redaction=True, kv_cache=False, use_deepspeed=False, half=False, device=None):
|
||||
enable_redaction=True, kv_cache=False, use_deepspeed=False, half=False, device=None,
|
||||
tokenizer_vocab_file=None, tokenizer_basic=False):
|
||||
|
||||
"""
|
||||
Constructor
|
||||
:param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
|
||||
|
|
@@ -228,7 +230,10 @@ class TextToSpeech:
|
|||
if self.enable_redaction:
|
||||
self.aligner = Wav2VecAlignment()
|
||||
|
||||
self.tokenizer = VoiceBpeTokenizer()
|
||||
self.tokenizer = VoiceBpeTokenizer(
|
||||
vocab_file=tokenizer_vocab_file,
|
||||
use_basic_cleaners=tokenizer_basic,
|
||||
)
|
||||
self.half = half
|
||||
if os.path.exists(f'{models_dir}/autoregressive.ptt'):
|
||||
# Assume this is a traced directory.
|
||||
|
|
|
|||
|
|
@@ -170,13 +170,14 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
|
|||
|
||||
|
||||
class VoiceBpeTokenizer:
|
||||
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
|
||||
if vocab_file is not None:
|
||||
self.tokenizer = Tokenizer.from_file(vocab_file)
|
||||
|
||||
def preprocess_text(self, txt):
|
||||
txt = english_cleaners(txt)
|
||||
return txt
|
||||
def __init__(self, vocab_file=None, use_basic_cleaners=False):
|
||||
self.tokenizer = Tokenizer.from_file(
|
||||
DEFAULT_VOCAB_FILE if vocab_file is None else vocab_file
|
||||
)
|
||||
if use_basic_cleaners:
|
||||
self.preprocess_text = basic_cleaners
|
||||
else:
|
||||
self.preprocess_text = english_cleaners
|
||||
|
||||
def encode(self, txt):
|
||||
txt = self.preprocess_text(txt)
|
||||
|
|
|
|||
Loading…
Reference in a new issue