add tokenizer

This commit is contained in:
Roberts Slisans 2023-08-15 11:27:43 +03:00
parent 4003544b6f
commit 72eccabcb7
3 changed files with 16 additions and 10 deletions

View file

@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup(
name="TorToiSe",
packages=setuptools.find_packages(),
version="2.7.0",
version="2.8.0",
author="James Betker",
author_email="james@adamant.ai",
description="A high quality multi-voice text-to-speech library",

View file

@ -207,7 +207,9 @@ class TextToSpeech:
"""
def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR,
enable_redaction=True, kv_cache=False, use_deepspeed=False, half=False, device=None):
enable_redaction=True, kv_cache=False, use_deepspeed=False, half=False, device=None,
tokenizer_vocab_file=None, tokenizer_basic=False):
"""
Constructor
:param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@ -228,7 +230,10 @@ class TextToSpeech:
if self.enable_redaction:
self.aligner = Wav2VecAlignment()
self.tokenizer = VoiceBpeTokenizer()
self.tokenizer = VoiceBpeTokenizer(
vocab_file=tokenizer_vocab_file,
use_basic_cleaners=tokenizer_basic,
)
self.half = half
if os.path.exists(f'{models_dir}/autoregressive.ptt'):
# Assume this is a traced directory.

View file

@ -170,13 +170,14 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '
class VoiceBpeTokenizer:
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
def preprocess_text(self, txt):
txt = english_cleaners(txt)
return txt
def __init__(self, vocab_file=None, use_basic_cleaners=False):
    """Build a BPE tokenizer for TTS text input.

    :param vocab_file: Path to a tokenizer vocab JSON; when None, the
        bundled DEFAULT_VOCAB_FILE is loaded instead.
    :param use_basic_cleaners: When True, normalize text with
        basic_cleaners rather than the default english_cleaners.
    """
    path = DEFAULT_VOCAB_FILE if vocab_file is None else vocab_file
    self.tokenizer = Tokenizer.from_file(path)
    # Bind the chosen normalization function once, as a plain instance
    # attribute (not a bound method), so encode() can call it uniformly.
    self.preprocess_text = basic_cleaners if use_basic_cleaners else english_cleaners
def encode(self, txt):
txt = self.preprocess_text(txt)