{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "_pIZ3ZXNp7cf"
},
"source": [
"Welcome to Tortoise! 🐢🐢🐢🐢\n",
"\n",
"Before you begin, I **strongly** recommend you turn on a GPU runtime.\n",
"\n",
"There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JrK20I32grP6"
},
"outputs": [],
"source": [
"#first follow the instructions in the README.md file under Local Installation\n",
"!pip3 install -r requirements.txt\n",
"# !python3 setup.py install"
]
},
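{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before loading the models, it is worth confirming that a GPU is actually visible. The cell below is a minimal sanity-check sketch using `torch.cuda.is_available()`; it assumes `torch` was installed by the requirements step above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check: confirm a CUDA GPU is visible before loading models.\n",
"import torch\n",
"\n",
"if torch.cuda.is_available():\n",
"    print(f\"GPU found: {torch.cuda.get_device_name(0)}\")\n",
"else:\n",
"    print(\"No GPU found - expect extremely slow inference on CPU!\")"
]
},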
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "Gen09NM4hONQ"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/manmay/anaconda3/envs/tortoise/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# Imports used through the rest of the notebook.\n",
"import torch\n",
"import torchaudio\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"import IPython\n",
"\n",
"from tortoise.api import TextToSpeech\n",
"from tortoise.utils.audio import load_audio, load_voice, load_voices\n",
"\n",
"# This will download all the models used by Tortoise from the HF hub.\n",
"tts = TextToSpeech()\n",
"# If you want to use deepspeed the pass use_deepspeed=True nearly 2x faster than normal\n",
"# tts = TextToSpeech(use_deepspeed=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "bt_aoxONjfL2"
},
"outputs": [],
"source": [
"# This is the text that will be spoken.\n",
"text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\"\n",
"\n",
"# Here's something for the poetically inclined.. (set text=)\n",
"\"\"\"\n",
"Then took the other, as just as fair,\n",
"And having perhaps the better claim,\n",
"Because it was grassy and wanted wear;\n",
"Though as for that the passing there\n",
"Had worn them really about the same,\"\"\"\n",
"\n",
"# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n",
"preset = \"ultra_fast\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "SSleVnRAiEE2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0m\u001b[01;34mangie\u001b[0m/ \u001b[01;34mfreeman\u001b[0m/ \u001b[01;34mmyself\u001b[0m/ \u001b[01;34mtom\u001b[0m/ \u001b[01;34mtrain_grace\u001b[0m/\n",
"\u001b[01;34mapplejack\u001b[0m/ \u001b[01;34mgeralt\u001b[0m/ \u001b[01;34mpat\u001b[0m/ \u001b[01;34mtrain_atkins\u001b[0m/ \u001b[01;34mtrain_kennard\u001b[0m/\n",
"\u001b[01;34mcond_latent_example\u001b[0m/ \u001b[01;34mhalle\u001b[0m/ \u001b[01;34mpat2\u001b[0m/ \u001b[01;34mtrain_daws\u001b[0m/ \u001b[01;34mtrain_lescault\u001b[0m/\n",
"\u001b[01;34mdaniel\u001b[0m/ \u001b[01;34mjlaw\u001b[0m/ \u001b[01;34mrainbow\u001b[0m/ \u001b[01;34mtrain_dotrice\u001b[0m/ \u001b[01;34mtrain_mouse\u001b[0m/\n",
"\u001b[01;34mdeniro\u001b[0m/ \u001b[01;34mlj\u001b[0m/ \u001b[01;34msnakes\u001b[0m/ \u001b[01;34mtrain_dreams\u001b[0m/ \u001b[01;34mweaver\u001b[0m/\n",
"\u001b[01;34memma\u001b[0m/ \u001b[01;34mmol\u001b[0m/ \u001b[01;34mtim_reynolds\u001b[0m/ \u001b[01;34mtrain_empire\u001b[0m/ \u001b[01;34mwilliam\u001b[0m/\n"
]
},
{
"data": {
"text/html": [
"\n",
" <audio controls=\"controls\" >\n",
" <source src=\"data:audio/x-wav;base64,UklGRrCcBwBXQVZFZm10IBAAAAABAAEAIlYAAESsAAACABAAZGF0YYycBwAjAPD/xf+9//b/8v/M/5b/GgA2AMj//P/B/+b/xf+k/8n/+/8hAND/2P/6/1YAYQAkAO//KgApAN3/i/98/wAA2v/q//n/FgBFANn/uP+f/+T/7P/G/wwAAAAwABoAIgBKACsAPwAyABwA4P/7/+//EwBHAPf/AAAiADkA8v/l/zgAIADm/9T//f8eAAsACQA3ACAAZgBbAC0AIwBgAD8A8P8xAD4AKADJ/x4ASgAvAO7/3P8MANr//v8EAAYAHwAUAAAA+P/q/w8AHwDS/9P/IwAdANf/yv/i/+3/2v+9/8D/zf/I/7n/sv+P/8D/z//9/xIA0//U//X/+v/J/6r/sf8PAOb/2v/O/wkAEwARANH/oP/e/8//z/9y//3/KwAJAO////9ZAE8AIgD3/+X/8v9HAAYA1/8DAFQAawAXAAoA/P8rABAAw/+h/7j/FwD1/8H/xv9BAF4Ayv9w/9z/JAAhAA8A1P8MABQATAAYAPj/MgBFAEQA8//N/zQAGgDe/9r/xf8PABwABQDU//P/CgDs/7n/6P/V/6X/AAANADUAtP/W/xQAJgDz/6z/mP+y/8z/2/+Y/4P/6P/8/4b/N/+h/9P/x/+6/6D/1v8FABYAxv/q/14AVQAGAL//BQAhAOT/AAAFAN3/CgAdANT/yf8YAEcA9//H/xAASAA7AA8AKABfAIQAUwAMABIAaACHAD8A+P8hAG8AZQAMAMP/AgBSABIAyP+7/+n/NQAQAOn/0/8wAIkAHwC6//r/cABbADwA6f8vABkAFgAOALP/x/8CACMA3f+f/+T/JQAWAOP/7/8SAB8AMgDv/+n/HQAxAE8AEgAeAFIASQDf/+b/LwAbAPf/2/8WABIA3f/C/6f/5//4/+7/6f/T/woAAgDP/6b/8v8wACwA8f/m/xQAOAAoAMz/9/8NABYAJwD9/+b/6/8LABQAwf/b/xYA8f+0/8r/+//5/wEADgBZAE4ASQA2ADEAGQATAB8ACQALACUAAgDV//D/+f8CAO//0f/r/yMA7f+9/9n/FQATAP7/CwA9AEYAFgAhADQALAAcACEA2P/y/0gAMABJABkAMgB/AD4AJgA1AD8ALAAmACYAJABbADoAQgA7ADcASgARAAIA9v/q//n/BwD1/x8AOQBBAB8AFgA5ADkAKgAJABMAHAD7//z/KwD//xoA7/++/7j/w//h/6b/ov+3/87/yv/F/7n/8P/N/8r/u//s/wEA+//6/+3/FAAfABAA///w/9P/DAAiAMP/xv/j//z/7P+s/8r/4//Z/8P/pP/t//X/4P+y/w8ACQD4/0gAIwAhABYAPgAvAPX/HQAyACAA+P8JADUADwAIAPH/GQALAPf//P/p/x8AFAApAP3/9P8ZADQANAAgACMAHAA5ACoA9v/5/yEAEQDN/6L/zf/k//7/uP/B/8D/2P/G/6D/3v+8/6P/tv/h/73/0v/j/+j/4/+n/9T/5P/s/woACwDu/wkA+P/P/+T/+f/a/wYA8P/b/9n/6f8CAOr/FgD8/xgA6f/h/yMAFgAFAOX/+/////7/5v/j/wUAHQAHAND/9v8ZAAkACgDx/yUAAADl/xkAAgAjABQAEQAAAAgADQDW/woAEQAFAA8A9f8hAPz/FwAtABMAAwDy//z/4P/K//L/KQAZABQAFgAQACwAJQD3/w4ABgAbACkABgAlABUA1f/Q//z/9//l//X/8P/0/+b/3f/g/9v/4v8AAOD/0//4/w0A6P/g/+z/DQD3/wUAIgAXAEAAIwAdABwAAQAPABsAGwD9//3/KgARAAMAAwALANv/5//8/9P/3P/q/wwA9P/p//P/CADx/93/7P/h/+D/9P/k/wIAIAAdADkABQAOAAwACAAEAAAA5v8IAAwA9/8mAB0AAQDV//L/7f/e/8z/+P8KAPL/CQDk/xYAIADv/w0AOwAgABUACQD7/9j/6//6//L/BgD4/xoAGgAPABkAEwAOAAkAFwAbABAA//8LAP3/EAAFABEAAQDb/+T/3f/3/xgA7v/8/w0A7//M/wsAGAAJACkAHgAfACEAFAAYAAwACQAfADIAKwBAAC4AHQAuADIALgArADUAQQAcAB8APAARAA4AFgAVAA4A5f8eAEEAHgAwABkAHwD5/+n/AgD3//7/AQD5/+z/AADq/wAAAQDr/+v//f8HAAwA9f/3/wMA9P/7/+f/AgD3//7/7f/q/yQAEwD2//X/EAAQABQA6v8GABEAIgAwABUAJgATACkAAgAmAC4AOwApABsAQAAbAEEASQArACAANAAOAEIAHAA4AEAADwAWAPv/9/8IABMAFQAYAPf/9//m//T/7//b/9T/wf/A/5j/rv+2/67/kP9//7f/m/+n/6//pv+2/83/wP/F/8n/zP/r/93/zP/D//T/8P/g//X/8f8SABIA+f8DACAAIgARAAAAFgAiABkAJwAgABYAGgAZAAcADgD5/wUABQD0/+z/5//v/+v/8v/j/+n/8f/p/9v/4P8CAPT/7//D/8z/5v/Q/+H/2//i/+D/8v/3/9T/4//c/+n/2v/f/+D/9v8BAAMAz//S//X/yP/U//P/+//y/+//5P/4//z/BgAgACgAMgAgACgACwAaAA4AJAAjAB4AJgA6ADsAIAAlAP//NQDy/+7/CQD4/ygA6v/r/xAABQD3/xUAIAAaABkACwAOAPH/AQDu/+3/BwAZAB8AEQAfAP3/+P///wQA//////T/AwD///r/5f8TABcACgADANn/AwAAAAAACgAMABoA///3//z/IgALAAgACwAZABYAIQAZAAIA+P/y/xAA/v8iABsADgAbAAcAHAAVAPf/FQD8//X/7//c/9n/3v/0/+j/BADz/wUA8v/v/+H/9P/5/83/5//u/+7/3/8LAPn/DAACAP3/5P/u/xwAEgAwAAEACAD8//b/EwASAPv/8f8EAAQACQDu/wUA5v/4/wAAAQAYAAoA+v8BABUAEQAZAAgACgD+/wUAIAALAO7//P8EAAMACQD4/wsAHAAFAPv/7v8CAAQAEAACAP3/+f/9//7/BwALAAIADgAMAAcA5//+//n/AgAMAAIAEwADACkA/P8fABAA8v/r/8P/7//W/+H/6f8BAP3/1v8BAAAA+v8DAAIA7f/S/+7/7P/6/xIAGQAOAB4AFwAAACoAIAAtAB0AEQAdACEAJgAgADwALAA7ADUALQAFACMAMwAlADMAIgBCADQAGQASABUACgDu//z/DgDz/x8AEQAPAAYA+f/1/87/3f/H//T/7f/i/9H/1P/o/+L/9v/w/xoADQAaAA0ADQAUAA8AJgAGAP7/BgAPADEAIgAnACIACQAdAAUAFgAWABAAFQAQAAQACgAAAA8A+f8SACsAAQAbAOr/8P/v/+T/4//Y/+X/1P/i//z/+//+/wsACgD9//P/BwD7/+f/5f8BAPr/1P/
U/+j/+P/F/8r/yv/C/+T/9/8KAA0AEQD8/wIABQDr/+b/6P/v/93/7v/q/+v/+v/q/wsADADn/+r/9//x/9b/2//v/+z/5//0/wIADQATABkABAD9//n/8v/t/+b/5f/t/+T/3P/x/9r/4P/s/+P/5//d/7//0v/e/8D/0//l/wMA4v/r/wwAEgAEAOv/EADu/wAA/P8XAB0AFwAdAAAAEQDt//b/+f/v/+X/2P/T//f/0P/u/+r/6v/1/97/7//j//D/6f/m/+T//f8RABQAEwAlABMABQAiAA4AJAD9/w0ABwD1/wQAGQD//xgAHgD9/w0A/f/7/+3/7//1/w8AFgDk/w4A9P/1/xIA9f/8//7/HQADABMAKgAaACwAQAAlACsAIQARABgAEwA/AEAALAAoAD4AJAAhABgAJwAjAB4AIwATACoACwAqAB4AFgAfABAAEABCACgAIgAkAA8ABwAAAAMA8P/8//f//v/3/wQA8//q/+r/9//3/9//8//4/
" Your browser does not support the audio element.\n",
" </audio>\n",
" "
],
"text/plain": [
"<IPython.lib.display.Audio object>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tortoise will attempt to mimic voices you provide. It comes pre-packaged\n",
"# with some voices you might recognize.\n",
"\n",
"# Let's list all the voices available. These are just some random clips I've gathered\n",
"# from the internet as well as a few voices from the training dataset.\n",
"# Feel free to add your own clips to the voices/ folder.\n",
"%ls tortoise/voices\n",
"\n",
"IPython.display.Audio('tortoise/voices/tom/1.wav')"
]
},
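{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to add your own voice, the sketch below shows the expected layout: any subdirectory of `tortoise/voices/` containing WAV clips of a speaker becomes a loadable voice. The name `my_voice` and the clip paths are placeholders, and the 6-10 second clip-length guidance follows the project README."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of adding a custom voice. Tortoise treats any\n",
"# subdirectory of tortoise/voices/ that holds WAV clips as a voice.\n",
"import os, shutil\n",
"\n",
"custom_voice = 'my_voice'  # placeholder name - use your own\n",
"os.makedirs(f'tortoise/voices/{custom_voice}', exist_ok=True)\n",
"\n",
"# Copy a few short clips (roughly 6-10 seconds each) into the folder, e.g.:\n",
"# shutil.copy('/path/to/clip1.wav', f'tortoise/voices/{custom_voice}/1.wav')\n",
"\n",
"# Once clips are in place, it loads like any built-in voice:\n",
"# voice_samples, conditioning_latents = load_voice(custom_voice)"
]
},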
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "KEXOKjIvn6NW"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generating autoregressive samples..\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 38%|███▊ | 6/16 [00:31<00:52, 5.20s/it]\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[39m# Load it and send it through Tortoise.\u001b[39;00m\n\u001b[1;32m 5\u001b[0m voice_samples, conditioning_latents \u001b[39m=\u001b[39m load_voice(voice)\n\u001b[0;32m----> 6\u001b[0m gen \u001b[39m=\u001b[39m tts\u001b[39m.\u001b[39;49mtts_with_preset(text, voice_samples\u001b[39m=\u001b[39;49mvoice_samples, conditioning_latents\u001b[39m=\u001b[39;49mconditioning_latents, \n\u001b[1;32m 7\u001b[0m preset\u001b[39m=\u001b[39;49mpreset)\n\u001b[1;32m 8\u001b[0m torchaudio\u001b[39m.\u001b[39msave(\u001b[39m'\u001b[39m\u001b[39mgenerated.wav\u001b[39m\u001b[39m'\u001b[39m, gen\u001b[39m.\u001b[39msqueeze(\u001b[39m0\u001b[39m)\u001b[39m.\u001b[39mcpu(), \u001b[39m24000\u001b[39m)\n\u001b[1;32m 9\u001b[0m IPython\u001b[39m.\u001b[39mdisplay\u001b[39m.\u001b[39mAudio(\u001b[39m'\u001b[39m\u001b[39mgenerated.wav\u001b[39m\u001b[39m'\u001b[39m)\n",
"File \u001b[0;32m/data/speech_synth/tortoise-tts/tortoise/api.py:329\u001b[0m, in \u001b[0;36mTextToSpeech.tts_with_preset\u001b[0;34m(self, text, preset, **kwargs)\u001b[0m\n\u001b[1;32m 327\u001b[0m settings\u001b[39m.\u001b[39mupdate(presets[preset])\n\u001b[1;32m 328\u001b[0m settings\u001b[39m.\u001b[39mupdate(kwargs) \u001b[39m# allow overriding of preset settings with kwargs\u001b[39;00m\n\u001b[0;32m--> 329\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtts(text, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49msettings)\n",
"File \u001b[0;32m/data/speech_synth/tortoise-tts/tortoise/api.py:412\u001b[0m, in \u001b[0;36mTextToSpeech.tts\u001b[0;34m(self, text, voice_samples, conditioning_latents, k, verbose, use_deterministic_seed, return_deterministic_state, num_autoregressive_samples, temperature, length_penalty, repetition_penalty, top_p, max_mel_tokens, cvvp_amount, diffusion_iterations, cond_free, cond_free_k, diffusion_temperature, **hf_generate_kwargs)\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mGenerating autoregressive samples..\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 411\u001b[0m \u001b[39mfor\u001b[39;00m b \u001b[39min\u001b[39;00m tqdm(\u001b[39mrange\u001b[39m(num_batches), disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m verbose):\n\u001b[0;32m--> 412\u001b[0m codes \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mautoregressive\u001b[39m.\u001b[39;49minference_speech(auto_conditioning, text_tokens,\n\u001b[1;32m 413\u001b[0m do_sample\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m,\n\u001b[1;32m 414\u001b[0m top_p\u001b[39m=\u001b[39;49mtop_p,\n\u001b[1;32m 415\u001b[0m temperature\u001b[39m=\u001b[39;49mtemperature,\n\u001b[1;32m 416\u001b[0m num_return_sequences\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mautoregressive_batch_size,\n\u001b[1;32m 417\u001b[0m length_penalty\u001b[39m=\u001b[39;49mlength_penalty,\n\u001b[1;32m 418\u001b[0m repetition_penalty\u001b[39m=\u001b[39;49mrepetition_penalty,\n\u001b[1;32m 419\u001b[0m max_generate_length\u001b[39m=\u001b[39;49mmax_mel_tokens,\n\u001b[1;32m 420\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mhf_generate_kwargs)\n\u001b[1;32m 421\u001b[0m padding_needed \u001b[39m=\u001b[39m max_mel_tokens \u001b[39m-\u001b[39m codes\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]\n\u001b[1;32m 422\u001b[0m codes \u001b[39m=\u001b[39m F\u001b[39m.\u001b[39mpad(codes, (\u001b[39m0\u001b[39m, padding_needed), value\u001b[39m=\u001b[39mstop_mel_token)\n",
"File \u001b[0;32m/data/speech_synth/tortoise-tts/tortoise/models/autoregressive.py:513\u001b[0m, in \u001b[0;36mUnifiedVoice.inference_speech\u001b[0;34m(self, speech_conditioning_latent, text_inputs, input_tokens, num_return_sequences, max_generate_length, typical_sampling, typical_mass, **hf_generate_kwargs)\u001b[0m\n\u001b[1;32m 511\u001b[0m logits_processor \u001b[39m=\u001b[39m LogitsProcessorList([TypicalLogitsWarper(mass\u001b[39m=\u001b[39mtypical_mass)]) \u001b[39mif\u001b[39;00m typical_sampling \u001b[39melse\u001b[39;00m LogitsProcessorList()\n\u001b[1;32m 512\u001b[0m max_length \u001b[39m=\u001b[39m trunc_index \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmax_mel_tokens \u001b[39m-\u001b[39m \u001b[39m1\u001b[39m \u001b[39mif\u001b[39;00m max_generate_length \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m trunc_index \u001b[39m+\u001b[39m max_generate_length\n\u001b[0;32m--> 513\u001b[0m gen \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49minference_model\u001b[39m.\u001b[39;49mgenerate(inputs, bos_token_id\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstart_mel_token, pad_token_id\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstop_mel_token, eos_token_id\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstop_mel_token,\n\u001b[1;32m 514\u001b[0m max_length\u001b[39m=\u001b[39;49mmax_length, logits_processor\u001b[39m=\u001b[39;49mlogits_processor,\n\u001b[1;32m 515\u001b[0m num_return_sequences\u001b[39m=\u001b[39;49mnum_return_sequences, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mhf_generate_kwargs)\n\u001b[1;32m 516\u001b[0m \u001b[39mreturn\u001b[39;00m gen[:, trunc_index:]\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[39m@functools\u001b[39m\u001b[39m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdecorate_context\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[39mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/transformers/generation_utils.py:1310\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[0;34m(self, inputs, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, typical_p, repetition_penalty, bad_words_ids, force_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, encoder_no_repeat_ngram_size, num_return_sequences, max_time, max_new_tokens, decoder_start_token_id, use_cache, num_beam_groups, diversity_penalty, prefix_allowed_tokens_fn, logits_processor, renormalize_logits, stopping_criteria, constraints, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, forced_bos_token_id, forced_eos_token_id, remove_invalid_values, synced_gpus, exponential_decay_length_penalty, **model_kwargs)\u001b[0m\n\u001b[1;32m 1302\u001b[0m input_ids, model_kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_expand_inputs_for_generation(\n\u001b[1;32m 1303\u001b[0m input_ids,\n\u001b[1;32m 1304\u001b[0m expand_size\u001b[39m=\u001b[39mnum_return_sequences,\n\u001b[1;32m 1305\u001b[0m is_encoder_decoder\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig\u001b[39m.\u001b[39mis_encoder_decoder,\n\u001b[1;32m 1306\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mmodel_kwargs,\n\u001b[1;32m 1307\u001b[0m )\n\u001b[1;32m 1309\u001b[0m \u001b[39m# 12. run sample\u001b[39;00m\n\u001b[0;32m-> 1310\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msample(\n\u001b[1;32m 1311\u001b[0m input_ids,\n\u001b[1;32m 1312\u001b[0m logits_processor\u001b[39m=\u001b[39;49mlogits_processor,\n\u001b[1;32m 1313\u001b[0m logits_warper\u001b[39m=\u001b[39;49mlogits_warper,\n\u001b[1;32m 1314\u001b[0m stopping_criteria\u001b[39m=\u001b[39;49mstopping_criteria,\n\u001b[1;32m 1315\u001b[0m pad_token_id\u001b[39m=\u001b[39;49mpad_token_id,\n\u001b[1;32m 1316\u001b[0m eos_token_id\u001b[39m=\u001b[39;49meos_token_id,\n\u001b[1;32m 1317\u001b[0m output_scores\u001b[39m=\u001b[39;49moutput_scores,\n\u001b[1;32m 1318\u001b[0m return_dict_in_generate\u001b[39m=\u001b[39;49mreturn_dict_in_generate,\n\u001b[1;32m 1319\u001b[0m synced_gpus\u001b[39m=\u001b[39;49msynced_gpus,\n\u001b[1;32m 1320\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mmodel_kwargs,\n\u001b[1;32m 1321\u001b[0m )\n\u001b[1;32m 1323\u001b[0m \u001b[39melif\u001b[39;00m is_beam_gen_mode:\n\u001b[1;32m 1324\u001b[0m \u001b[39mif\u001b[39;00m num_return_sequences \u001b[39m>\u001b[39m num_beams:\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/transformers/generation_utils.py:1926\u001b[0m, in \u001b[0;36mGenerationMixin.sample\u001b[0;34m(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)\u001b[0m\n\u001b[1;32m 1923\u001b[0m model_inputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mprepare_inputs_for_generation(input_ids, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mmodel_kwargs)\n\u001b[1;32m 1925\u001b[0m \u001b[39m# forward pass to get next token\u001b[39;00m\n\u001b[0;32m-> 1926\u001b[0m outputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m(\n\u001b[1;32m 1927\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mmodel_inputs,\n\u001b[1;32m 1928\u001b[0m return_dict\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m,\n\u001b[1;32m 1929\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 1930\u001b[0m output_hidden_states\u001b[39m=\u001b[39;49moutput_hidden_states,\n\u001b[1;32m 1931\u001b[0m )\n\u001b[1;32m 1933\u001b[0m \u001b[39mif\u001b[39;00m synced_gpus \u001b[39mand\u001b[39;00m this_peer_finished:\n\u001b[1;32m 1934\u001b[0m cur_len \u001b[39m=\u001b[39m cur_len \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m/data/speech_synth/tortoise-tts/tortoise/models/autoregressive.py:142\u001b[0m, in \u001b[0;36mGPT2InferenceModel.forward\u001b[0;34m(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 139\u001b[0m emb \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39membeddings(input_ids)\n\u001b[1;32m 140\u001b[0m emb \u001b[39m=\u001b[39m emb \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtext_pos_embedding\u001b[39m.\u001b[39mget_fixed_embedding(attention_mask\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m]\u001b[39m-\u001b[39mmel_len, attention_mask\u001b[39m.\u001b[39mdevice)\n\u001b[0;32m--> 142\u001b[0m transformer_outputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtransformer(\n\u001b[1;32m 143\u001b[0m inputs_embeds\u001b[39m=\u001b[39;49memb,\n\u001b[1;32m 144\u001b[0m past_key_values\u001b[39m=\u001b[39;49mpast_key_values,\n\u001b[1;32m 145\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 146\u001b[0m token_type_ids\u001b[39m=\u001b[39;49mtoken_type_ids,\n\u001b[1;32m 147\u001b[0m position_ids\u001b[39m=\u001b[39;49mposition_ids,\n\u001b[1;32m 148\u001b[0m head_mask\u001b[39m=\u001b[39;49mhead_mask,\n\u001b[1;32m 149\u001b[0m encoder_hidden_states\u001b[39m=\u001b[39;49mencoder_hidden_states,\n\u001b[1;32m 150\u001b[0m encoder_attention_mask\u001b[39m=\u001b[39;49mencoder_attention_mask,\n\u001b[1;32m 151\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 152\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 153\u001b[0m output_hidden_states\u001b[39m=\u001b[39;49moutput_hidden_states,\n\u001b[1;32m 154\u001b[0m return_dict\u001b[39m=\u001b[39;49mreturn_dict,\n\u001b[1;32m 155\u001b[0m )\n\u001b[1;32m 156\u001b[0m hidden_states \u001b[39m=\u001b[39m transformer_outputs[\u001b[39m0\u001b[39m]\n\u001b[1;32m 158\u001b[0m \u001b[39m# Set device for model parallelism\u001b[39;00m\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py:889\u001b[0m, in \u001b[0;36mGPT2Model.forward\u001b[0;34m(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 879\u001b[0m outputs \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mutils\u001b[39m.\u001b[39mcheckpoint\u001b[39m.\u001b[39mcheckpoint(\n\u001b[1;32m 880\u001b[0m create_custom_forward(block),\n\u001b[1;32m 881\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 886\u001b[0m encoder_attention_mask,\n\u001b[1;32m 887\u001b[0m )\n\u001b[1;32m 888\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 889\u001b[0m outputs \u001b[39m=\u001b[39m block(\n\u001b[1;32m 890\u001b[0m hidden_states,\n\u001b[1;32m 891\u001b[0m layer_past\u001b[39m=\u001b[39;49mlayer_past,\n\u001b[1;32m 892\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 893\u001b[0m head_mask\u001b[39m=\u001b[39;49mhead_mask[i],\n\u001b[1;32m 894\u001b[0m encoder_hidden_states\u001b[39m=\u001b[39;49mencoder_hidden_states,\n\u001b[1;32m 895\u001b[0m encoder_attention_mask\u001b[39m=\u001b[39;49mencoder_attention_mask,\n\u001b[1;32m 896\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 897\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 898\u001b[0m )\n\u001b[1;32m 900\u001b[0m hidden_states \u001b[39m=\u001b[39m outputs[\u001b[39m0\u001b[39m]\n\u001b[1;32m 901\u001b[0m \u001b[39mif\u001b[39;00m use_cache \u001b[39mis\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py:390\u001b[0m, in \u001b[0;36mGPT2Block.forward\u001b[0;34m(self, hidden_states, layer_past, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions)\u001b[0m\n\u001b[1;32m 388\u001b[0m residual \u001b[39m=\u001b[39m hidden_states\n\u001b[1;32m 389\u001b[0m hidden_states \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mln_1(hidden_states)\n\u001b[0;32m--> 390\u001b[0m attn_outputs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mattn(\n\u001b[1;32m 391\u001b[0m hidden_states,\n\u001b[1;32m 392\u001b[0m layer_past\u001b[39m=\u001b[39;49mlayer_past,\n\u001b[1;32m 393\u001b[0m attention_mask\u001b[39m=\u001b[39;49mattention_mask,\n\u001b[1;32m 394\u001b[0m head_mask\u001b[39m=\u001b[39;49mhead_mask,\n\u001b[1;32m 395\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 396\u001b[0m output_attentions\u001b[39m=\u001b[39;49moutput_attentions,\n\u001b[1;32m 397\u001b[0m )\n\u001b[1;32m 398\u001b[0m attn_output \u001b[39m=\u001b[39m attn_outputs[\u001b[39m0\u001b[39m] \u001b[39m# output_attn: a, present, (attentions)\u001b[39;00m\n\u001b[1;32m 399\u001b[0m outputs \u001b[39m=\u001b[39m attn_outputs[\u001b[39m1\u001b[39m:]\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
"File \u001b[0;32m~/anaconda3/envs/tortoise/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py:290\u001b[0m, in \u001b[0;36mGPT2Attention.forward\u001b[0;34m(self, hidden_states, layer_past, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions)\u001b[0m\n\u001b[1;32m 287\u001b[0m new_shape \u001b[39m=\u001b[39m tensor\u001b[39m.\u001b[39msize()[:\u001b[39m-\u001b[39m\u001b[39m2\u001b[39m] \u001b[39m+\u001b[39m (num_heads \u001b[39m*\u001b[39m attn_head_size,)\n\u001b[1;32m 288\u001b[0m \u001b[39mreturn\u001b[39;00m tensor\u001b[39m.\u001b[39mview(new_shape)\n\u001b[0;32m--> 290\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\n\u001b[1;32m 291\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 292\u001b[0m hidden_states: Optional[Tuple[torch\u001b[39m.\u001b[39mFloatTensor]],\n\u001b[1;32m 293\u001b[0m layer_past: Optional[Tuple[torch\u001b[39m.\u001b[39mTensor]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 294\u001b[0m attention_mask: Optional[torch\u001b[39m.\u001b[39mFloatTensor] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 295\u001b[0m head_mask: Optional[torch\u001b[39m.\u001b[39mFloatTensor] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 296\u001b[0m encoder_hidden_states: Optional[torch\u001b[39m.\u001b[39mTensor] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 297\u001b[0m encoder_attention_mask: Optional[torch\u001b[39m.\u001b[39mFloatTensor] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 298\u001b[0m use_cache: Optional[\u001b[39mbool\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 299\u001b[0m output_attentions: Optional[\u001b[39mbool\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 300\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tuple[Union[torch\u001b[39m.\u001b[39mTensor, Tuple[torch\u001b[39m.\u001b[39mTensor]], \u001b[39m.\u001b[39m\u001b[39m.\u001b[39m\u001b[39m.\u001b[39m]:\n\u001b[1;32m 301\u001b[0m \u001b[39mif\u001b[39;00m encoder_hidden_states \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 302\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mhasattr\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mq_attn\u001b[39m\u001b[39m\"\u001b[39m):\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"# Pick one of the voices from the output above\n",
"voice = 'tom'\n",
"\n",
"# Load it and send it through Tortoise.\n",
"voice_samples, conditioning_latents = load_voice(voice)\n",
"gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
" preset=preset)\n",
"torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n",
"IPython.display.Audio('generated.wav')"
]
},
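{
"cell_type": "markdown",
"metadata": {},
"source": [
"Presets are just bundles of sampling settings: as the traceback above shows, `tts_with_preset` applies the preset and then merges any extra keyword arguments over it (`settings.update(kwargs)` in `api.py`), so individual knobs can be overridden. The sketch below tweaks two of them; the specific values are illustrative only - see `api.py` for defaults and the full list."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Override individual preset settings via keyword arguments.\n",
"# The values here are illustrative; see api.py for defaults.\n",
"gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,\n",
"                          preset=preset,\n",
"                          num_autoregressive_samples=32,  # more samples: better quality, slower\n",
"                          diffusion_iterations=100)       # more iterations: cleaner audio, slower\n",
"torchaudio.save('generated_tuned.wav', gen.squeeze(0).cpu(), 24000)\n",
"IPython.display.Audio('generated_tuned.wav')"
]
},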
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "16Xs2SSC3BXa"
},
"outputs": [],
"source": [
"# Tortoise can also generate speech using a random voice. The voice changes each time you execute this!\n",
"# (Note: random voices can be prone to strange utterances)\n",
"gen = tts.tts_with_preset(text, voice_samples=None, conditioning_latents=None, preset=preset)\n",
"torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n",
"IPython.display.Audio('generated.wav')"
]
},
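{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the random voice changes on every run, a fixed seed is useful for reproducing a result you like. The `tts` signature (visible in the traceback above) accepts `use_deterministic_seed`; the sketch below assumes that passing an integer seed pins the sampling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reproduce a random-voice result by fixing the seed.\n",
"# Assumes use_deterministic_seed accepts an integer seed (see the tts signature).\n",
"gen = tts.tts_with_preset(text, voice_samples=None, conditioning_latents=None,\n",
"                          preset=preset, use_deterministic_seed=42)\n",
"torchaudio.save('generated_seeded.wav', gen.squeeze(0).cpu(), 24000)\n",
"IPython.display.Audio('generated_seeded.wav')"
]
},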
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "fYTk8KUezUr5"
},
"outputs": [],
"source": [
"# You can also combine conditioning voices. Combining voices produces a new voice\n",
"# with traits from all the parents.\n",
"#\n",
"# Lets see what it would sound like if Picard and Kirk had a kid with a penchant for philosophy:\n",
"voice_samples, conditioning_latents = load_voices(['pat', 'william'])\n",
"\n",
"gen = tts.tts_with_preset(\"They used to say that if man was meant to fly, he’ d have wings. But he did fly. He discovered he had to.\", \n",
" voice_samples=None, conditioning_latents=None, preset=preset)\n",
"torchaudio.save('captain_kirkard.wav', gen.squeeze(0).cpu(), 24000)\n",
"IPython.display.Audio('captain_kirkard.wav')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "t66yqWgu68KL"
},
"outputs": [],
"source": [
"del tts # Will break other cells, but necessary to conserve RAM if you want to run this cell.\n",
"\n",
"# Tortoise comes with some scripts that does a lot of the lifting for you. For example,\n",
"# read.py will read a text file for you.\n",
"!python3 tortoise/read.py --voice=train_atkins --textfile=tortoise/data/riding_hood.txt --preset=ultra_fast --output_path=.\n",
"\n",
"IPython.display.Audio('train_atkins/combined.wav')\n",
"# This will take awhile.."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "tortoise-tts.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 4
}