diff --git a/tortoise/tts_stream.py b/tortoise/tts_stream.py
index 32a129d..9b8df6a 100644
--- a/tortoise/tts_stream.py
+++ b/tortoise/tts_stream.py
@@ -11,13 +11,32 @@ from utils.text import split_and_recombine_text
 import sounddevice as sd
 import queue
 import threading
+import pydub
+import pydub.playback  # not loaded by a bare `import pydub`
+
+
 def play_audio(audio_queue):
+    """Play queued audio chunks with pydub until a ``None`` sentinel arrives.
+
+    Args:
+        audio_queue: Queue whose ``get()`` yields audio chunks exposing
+            ``.cpu().numpy()`` (e.g. torch tensors), then ``None`` to stop.
+    """
     while True:
         chunk = audio_queue.get()
         if chunk is None:
             break
-        sd.play(chunk.cpu().numpy(), samplerate=24000)
-        sd.wait()
+        samples = chunk.cpu().numpy()
+        # The segment below is declared as 16-bit PCM (sample_width=2), so raw
+        # float bytes would be misread as noise. Float samples are assumed to
+        # lie in [-1.0, 1.0] (TODO confirm against the producer) and are scaled
+        # to int16; integer input is assumed to already be 16-bit PCM.
+        if samples.dtype.kind == 'f':
+            samples = samples.clip(-1.0, 1.0) * 32767
+        pcm = samples.astype('<i2')
+        sound = pydub.AudioSegment(
+            pcm.tobytes(), frame_rate=24000, channels=1, sample_width=2)
+        pydub.playback.play(sound)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()