Merge pull request #7452 from oobabooga/dev

Merge dev branch
This commit is contained in:
oobabooga 2026-04-02 22:18:46 -03:00 committed by GitHub
commit ae699ac570
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
50 changed files with 1160 additions and 651 deletions

View file

@ -68,3 +68,31 @@ jobs:
with:
version: ${{ inputs.version }}
config: 'os:macos-15-intel,macos-14'
# Portable-build jobs for the ik_llama.cpp backend. Each job entry belongs
# under the top-level `jobs:` key and invokes a reusable workflow
# (workflow_call), narrowing the build matrix to a single OS via `config`.
  build_release_ik_cuda_windows:
    name: ik CUDA Windows
    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:windows-2022'

  build_release_ik_cuda_linux:
    name: ik CUDA Linux
    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:ubuntu-22.04'

  build_release_ik_cpu_windows:
    name: ik CPU Windows
    uses: ./.github/workflows/build-portable-release-ik.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:windows-2022'

  build_release_ik_cpu_linux:
    name: ik CPU Linux
    uses: ./.github/workflows/build-portable-release-ik.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:ubuntu-22.04'

View file

@ -0,0 +1,178 @@
# Builds portable text-generation-webui packages with the ik_llama.cpp CUDA
# backend for Windows and Linux, then uploads the archives to the GitHub
# release matching the given version tag.
name: Build ik CUDA

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string
  # Mirror of the dispatch inputs so this workflow can be called from the
  # release orchestrator via `uses:`.
  workflow_call:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string

# Needed by the upload step to attach assets to the release.
permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    env:
      CONFIGIN: ${{ inputs.config }}
      EXCLUDEIN: ${{ inputs.exclude }}
    steps:
      - name: Define Job Output
        id: set-matrix
        # Builds the JSON build matrix. `config` ('key:item,item;...') replaces
        # whole matrix axes; `exclude` ('key:item,...;...') becomes matrix
        # exclusion entries.
        run: |
          $matrix = @{
            'os' = @('ubuntu-22.04', 'windows-2022')
            'pyver' = @("3.13")
            'cuda' = @("12.4", "13.1")
          }

          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

          if ($env:EXCLUDEIN -ne 'None') {
            $exclusions = @()
            $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
            $matrix['exclude'] = $exclusions
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      PCKGVER: ${{ inputs.version }}
    steps:
      - uses: actions/checkout@v6
        with:
          repository: 'oobabooga/text-generation-webui'
          ref: ${{ inputs.version }}
          submodules: 'recursive'

      - uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.pyver }}

      - name: Build Package
        shell: bash
        run: |
          VERSION_CLEAN="${{ inputs.version }}"
          VERSION_CLEAN="${VERSION_CLEAN#v}"
          cd ..
          cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
          cd "text-generation-webui-${VERSION_CLEAN}"

          # Remove extensions that need additional requirements
          allowed=("character_bias" "gallery" "sd_api_pictures")
          find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

          # Define common variables
          CUDA_VERSION="${{ matrix.cuda }}"
          VERSION="${{ inputs.version }}"

          # 1. Set platform-specific variables
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            PLATFORM="windows"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/python.exe -m pip"
            PACKAGES_PATH="portable_env/Lib/site-packages"
            rm start_linux.sh start_macos.sh
          else
            PLATFORM="linux"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/bin/python -m pip"
            PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
            rm start_macos.sh start_windows.bat
          fi

          # 2. Download and extract Python
          cd ..
          echo "Downloading Python for $PLATFORM..."
          curl -L -o python-build.tar.gz "$PYTHON_URL"
          tar -xzf python-build.tar.gz
          mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"

          # 3. Prepare requirements file based on CUDA version
          cd "text-generation-webui-${VERSION_CLEAN}"
          if [[ "$CUDA_VERSION" == "13.1" ]]; then
            REQ_FILE="requirements/portable/requirements_ik_cuda131.txt"
          else
            REQ_FILE="requirements/portable/requirements_ik.txt"
          fi

          # 4. Inject --ik into start scripts
          sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true

          # 5. Install packages
          echo "Installing Python packages from $REQ_FILE..."
          $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

          # 6. Clean up
          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py

          # 7. Create archive
          cd ..
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
            echo "Creating archive: $ARCHIVE_NAME"
            powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
          else
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
            echo "Creating archive: $ARCHIVE_NAME"
            tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
          fi

      - name: Upload files to a GitHub release
        id: upload-release
        uses: svenstaro/upload-release-action@2.7.0
        # Best effort: a failed upload should not fail the whole matrix job.
        continue-on-error: true
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ../textgen-portable-ik-*
          tag: ${{ inputs.version }}
          file_glob: true
          make_latest: false
          overwrite: true

View file

@ -0,0 +1,173 @@
# Builds CPU-only portable text-generation-webui packages with the
# ik_llama.cpp backend for Windows and Linux, then uploads the archives to
# the GitHub release matching the given version tag.
name: Build ik CPU

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string
  # Mirror of the dispatch inputs so this workflow can be called from the
  # release orchestrator via `uses:`.
  workflow_call:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string

# Needed by the upload step to attach assets to the release.
permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    env:
      CONFIGIN: ${{ inputs.config }}
      EXCLUDEIN: ${{ inputs.exclude }}
    steps:
      - name: Define Job Output
        id: set-matrix
        # Builds the JSON build matrix (no CUDA axis for CPU-only builds).
        # `config` replaces whole axes; `exclude` becomes exclusion entries.
        run: |
          $matrix = @{
            'os' = @('ubuntu-22.04', 'windows-2022')
            'pyver' = @("3.13")
          }

          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

          if ($env:EXCLUDEIN -ne 'None') {
            $exclusions = @()
            $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
            $matrix['exclude'] = $exclusions
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} ${{ matrix.pyver }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      PCKGVER: ${{ inputs.version }}
    steps:
      - uses: actions/checkout@v6
        with:
          repository: 'oobabooga/text-generation-webui'
          ref: ${{ inputs.version }}
          submodules: 'recursive'

      - uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.pyver }}

      - name: Build Package
        shell: bash
        run: |
          VERSION_CLEAN="${{ inputs.version }}"
          VERSION_CLEAN="${VERSION_CLEAN#v}"
          cd ..
          cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
          cd "text-generation-webui-${VERSION_CLEAN}"

          # Remove extensions that need additional requirements
          allowed=("character_bias" "gallery" "sd_api_pictures")
          find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

          # Define common variables
          VERSION="${{ inputs.version }}"

          # 1. Set platform-specific variables
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            PLATFORM="windows-cpu"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/python.exe -m pip"
            PACKAGES_PATH="portable_env/Lib/site-packages"
            rm start_linux.sh start_macos.sh
          else
            PLATFORM="linux-cpu"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
            PIP_PATH="portable_env/bin/python -m pip"
            PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
            rm start_macos.sh start_windows.bat
          fi

          # 2. Download and extract Python
          echo "Downloading Python for $PLATFORM..."
          cd ..
          curl -L -o python-build.tar.gz "$PYTHON_URL"
          tar -xzf python-build.tar.gz
          mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"

          # 3. Prepare requirements file
          cd "text-generation-webui-${VERSION_CLEAN}"
          REQ_FILE="requirements/portable/requirements_ik_cpu_only.txt"
          echo "Using requirements file: $REQ_FILE"

          # 4. Inject --ik into start scripts
          sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true

          # 5. Install packages
          echo "Installing Python packages from $REQ_FILE..."
          $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

          # 6. Clean up
          rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py

          # 7. Create archive
          cd ..
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip"
            echo "Creating archive: $ARCHIVE_NAME"
            powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
          else
            ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
            echo "Creating archive: $ARCHIVE_NAME"
            tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
          fi

      - name: Upload files to a GitHub release
        id: upload-release
        uses: svenstaro/upload-release-action@2.7.0
        # Best effort: a failed upload should not fail the whole matrix job.
        continue-on-error: true
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ../textgen-portable-ik-*
          tag: ${{ inputs.version }}
          file_glob: true
          make_latest: false
          overwrite: true

View file

@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin
The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template.
Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format.
Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format.
### Chat-instruct

View file

@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \
#### Chat completions
Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`.
Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata.
```shell
curl http://127.0.0.1:5000/v1/chat/completions \

View file

@ -158,28 +158,21 @@ class ModelDownloader:
# Also if GGUF and safetensors are available, download only safetensors
if (has_pytorch or has_pt or has_gguf) and has_safetensors:
has_gguf = False
for i in range(len(classifications) - 1, -1, -1):
if classifications[i] in ['pytorch', 'pt', 'gguf']:
links.pop(i)
file_sizes.pop(i)
keep = [i for i, c in enumerate(classifications) if c not in ['pytorch', 'pt', 'gguf']]
links = [links[i] for i in keep]
file_sizes = [file_sizes[i] for i in keep]
# For GGUF, try to download only the Q4_K_M if no specific file is specified.
if has_gguf and specific_file is None:
has_q4km = False
for i in range(len(classifications) - 1, -1, -1):
if 'q4_k_m' in links[i].lower():
has_q4km = True
has_q4km = any('q4_k_m' in link.lower() for link in links)
if has_q4km:
for i in range(len(classifications) - 1, -1, -1):
if 'q4_k_m' not in links[i].lower():
links.pop(i)
file_sizes.pop(i)
keep = [i for i, link in enumerate(links) if 'q4_k_m' in link.lower()]
else:
for i in range(len(classifications) - 1, -1, -1):
if links[i].lower().endswith('.gguf'):
links.pop(i)
file_sizes.pop(i)
keep = [i for i, link in enumerate(links) if not link.lower().endswith('.gguf')]
links = [links[i] for i in keep]
file_sizes = [file_sizes[i] for i in keep]
is_llamacpp = has_gguf and specific_file is not None
return links, sha256, is_lora, is_llamacpp, file_sizes

View file

@ -2,8 +2,11 @@ import concurrent.futures
import requests
from modules.web_search import _validate_url
def download_single(url):
_validate_url(url)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

View file

@ -5,12 +5,14 @@ import requests
from bs4 import BeautifulSoup
import extensions.superboogav2.parameters as parameters
from modules.web_search import _validate_url
from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
_validate_url(url)
response = requests.get(url, timeout=5)
if response.status_code == 200:
return response.content

View file

@ -1,6 +1,6 @@
function toggleDarkMode() {
document.body.classList.toggle("dark");
var currentCSS = document.getElementById("highlight-css");
const currentCSS = document.getElementById("highlight-css");
if (currentCSS.getAttribute("href") === "file/css/highlightjs/github-dark.min.css") {
currentCSS.setAttribute("href", "file/css/highlightjs/github.min.css");
} else {
@ -9,12 +9,10 @@ function toggleDarkMode() {
// Re-highlight all code blocks once stylesheet loads
currentCSS.onload = function() {
const messageBodies = document.getElementById("chat").querySelectorAll(".message-body");
messageBodies.forEach((messageBody) => {
const codeBlocks = messageBody.querySelectorAll("pre code");
codeBlocks.forEach((codeBlock) => {
hljs.highlightElement(codeBlock);
});
// Clear data-highlighted so hljs will re-process with the new theme
document.querySelectorAll("#chat .message-body pre code[data-highlighted]").forEach((codeBlock) => {
delete codeBlock.dataset.highlighted;
});
doSyntaxHighlighting();
};
}

View file

@ -1,11 +1,35 @@
// -------------------------------------------------
// Shared helpers
// -------------------------------------------------
function getProfilePictureUrl() {
return "/file/user_data/cache/pfp_character.png?time=" + Date.now();
}
const MESSAGE_SELECTOR = ".message, .user-message, .assistant-message";
function getMessageElement(element) {
if (!element) return null;
return element.closest(MESSAGE_SELECTOR);
}
function isUserRole(messageElement) {
return messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") !== null ||
messageElement.querySelector(".circle-you") !== null;
}
// Trigger a synthetic 'input' event so Gradio picks up programmatic value changes
function dispatchGradioInput(element) {
element.dispatchEvent(new Event("input", { bubbles: true }));
}
// -------------------------------------------------
// Event handlers
// -------------------------------------------------
function copyToClipboard(element) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(element);
if (!messageElement) return;
const rawText = messageElement.getAttribute("data-raw");
@ -48,9 +72,7 @@ function fallbackCopyToClipboard(text) {
}
function branchHere(element) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(element);
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
@ -69,11 +91,7 @@ function branchHere(element) {
}
branchIndexInput.value = index;
// Trigger any 'change' or 'input' events Gradio might be listening for
const event = new Event("input", { bubbles: true });
branchIndexInput.dispatchEvent(event);
dispatchGradioInput(branchIndexInput);
branchButton.click();
}
@ -82,9 +100,7 @@ function branchHere(element) {
// -------------------------------------------------
function editHere(buttonElement) {
if (!buttonElement) return;
const messageElement = buttonElement.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(buttonElement);
if (!messageElement) return;
const messageBody = messageElement.querySelector(".message-body");
@ -97,12 +113,7 @@ function editHere(buttonElement) {
return;
}
// Determine role based on message element - handle different chat modes
const isUserMessage = messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") !== null ||
messageElement.querySelector(".circle-you") !== null;
startEditing(messageElement, messageBody, isUserMessage);
startEditing(messageElement, messageBody, isUserRole(messageElement));
}
function startEditing(messageElement, messageBody, isUserMessage) {
@ -209,30 +220,22 @@ function submitMessageEdit(index, newText, isUserMessage) {
editTextInput.value = newText;
editRoleInput.value = isUserMessage ? "user" : "assistant";
editIndexInput.dispatchEvent(new Event("input", { bubbles: true }));
editTextInput.dispatchEvent(new Event("input", { bubbles: true }));
editRoleInput.dispatchEvent(new Event("input", { bubbles: true }));
dispatchGradioInput(editIndexInput);
dispatchGradioInput(editTextInput);
dispatchGradioInput(editRoleInput);
editButton.click();
return true;
}
function navigateVersion(element, direction) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
const messageElement = getMessageElement(element);
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
if (!index) return;
// Determine role based on message element classes
let role = "assistant"; // Default role
if (messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") ||
messageElement.querySelector(".circle-you")) {
role = "user";
}
const role = isUserRole(messageElement) ? "user" : "assistant";
const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
@ -248,11 +251,9 @@ function navigateVersion(element, direction) {
directionInput.value = direction;
roleInput.value = role;
// Trigger 'input' events for Gradio to pick up changes
const event = new Event("input", { bubbles: true });
indexInput.dispatchEvent(event);
directionInput.dispatchEvent(event);
roleInput.dispatchEvent(event);
dispatchGradioInput(indexInput);
dispatchGradioInput(directionInput);
dispatchGradioInput(roleInput);
navigateButton.click();
}
@ -313,7 +314,7 @@ function handleMorphdomUpdate(data) {
function applyMorphdomUpdate(data) {
// Determine target element and use it as query scope
var target_element, target_html;
let target_element, target_html;
if (data.last_message_only) {
const childNodes = document.getElementsByClassName("messages")[0].childNodes;
target_element = childNodes[childNodes.length - 1];

View file

@ -4,8 +4,9 @@
// Sync highlight.js theme with the actual Gradio theme
var defined_hljs_css = document.body.classList.contains("dark") ? "file/css/highlightjs/github-dark.min.css" : "file/css/highlightjs/github.min.css";
if (document.getElementById("highlight-css").getAttribute("href") !== defined_hljs_css) {
document.getElementById("highlight-css").setAttribute("href", defined_hljs_css);
var hljsCssElement = document.getElementById("highlight-css");
if (hljsCssElement.getAttribute("href") !== defined_hljs_css) {
hljsCssElement.setAttribute("href", defined_hljs_css);
}
let main_parent = document.getElementById("chat-tab").parentNode;
@ -49,21 +50,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
//------------------------------------------------
// --- Helper functions --- //
function isModifiedKeyboardEvent() {
return (event instanceof KeyboardEvent &&
event.shiftKey ||
event.ctrlKey ||
event.altKey ||
event.metaKey);
function isModifiedKeyboardEvent(event) {
return event instanceof KeyboardEvent &&
(event.shiftKey || event.ctrlKey || event.altKey || event.metaKey);
}
function isFocusedOnEditableTextbox() {
function isFocusedOnEditableTextbox(event) {
if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") {
return !!event.target.value;
}
return false;
}
let previousTabId = "chat-tab-button";
document.addEventListener("keydown", function(event) {
// Stop generation on Esc pressed
if (event.key === "Escape") {
@ -117,14 +115,14 @@ document.addEventListener("keydown", function(event) {
}
// --- Simple version navigation --- //
if (!isFocusedOnEditableTextbox()) {
if (!isFocusedOnEditableTextbox(event)) {
// Version navigation on Arrow keys (horizontal)
if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") {
if (!isModifiedKeyboardEvent(event) && event.key === "ArrowLeft") {
event.preventDefault();
navigateLastAssistantMessage("left");
}
else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") {
else if (!isModifiedKeyboardEvent(event) && event.key === "ArrowRight") {
event.preventDefault();
if (!navigateLastAssistantMessage("right")) {
// If can't navigate right (last version), regenerate
@ -159,9 +157,8 @@ targetElement.addEventListener("scroll", function() {
let diff = targetElement.scrollHeight - targetElement.clientHeight;
let isAtBottomNow = Math.abs(targetElement.scrollTop - diff) <= 10 || diff <= 0;
// Add scrolling class to disable hover effects
if (window.isScrolled || !isAtBottomNow) {
targetElement.classList.add("scrolling");
targetElement.classList.add("scrolling"); // Disables hover effects during scroll
}
if(isAtBottomNow) {
@ -202,12 +199,8 @@ const observer = new MutationObserver(function() {
});
// Only watch for attribute changes on targetElement (e.g. _generating class)
const config = {
attributes: true
};
// Start observing the target element
observer.observe(targetElement, config);
observer.observe(targetElement, { attributes: true });
//------------------------------------------------
// Handle syntax highlighting / LaTeX
@ -228,7 +221,7 @@ window.doSyntaxHighlighting = function() {
if (messageBodies.length > 0) {
let hasSeenVisible = false;
// Go from last message to first
// Go from last message to first so we can early-exit once past visible area
for (let i = messageBodies.length - 1; i >= 0; i--) {
const messageBody = messageBodies[i];
@ -243,8 +236,8 @@ window.doSyntaxHighlighting = function() {
codeBlock.classList.add("pretty_scrollbar");
});
// Only render math in visible elements
const mathContainers = messageBody.querySelectorAll("p, span, li, td, th, h1, h2, h3, h4, h5, h6, blockquote, figcaption, caption, dd, dt");
// Only render math in individually visible containers (the outer check is on the message body)
mathContainers.forEach(container => {
if (isElementVisibleOnScreen(container)) {
renderMathInElement(container, {
@ -271,7 +264,7 @@ const doSyntaxHighlighting = window.doSyntaxHighlighting;
// Add some scrollbars
//------------------------------------------------
const scrollbarElements = document.querySelectorAll(".add_scrollbar textarea, .add_scrollbar .drag-drop-list");
for(i = 0; i < scrollbarElements.length; i++) {
for(let i = 0; i < scrollbarElements.length; i++) {
scrollbarElements[i].classList.remove("scroll-hide");
scrollbarElements[i].classList.add("pretty_scrollbar");
scrollbarElements[i].style.resize = "none";
@ -298,13 +291,13 @@ if (toolsInfo) {
// Remove some backgrounds
//------------------------------------------------
const noBackgroundelements = document.querySelectorAll(".no-background");
for(i = 0; i < noBackgroundelements.length; i++) {
for(let i = 0; i < noBackgroundelements.length; i++) {
noBackgroundelements[i].parentNode.style.border = "none";
noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center";
}
const slimDropdownElements = document.querySelectorAll(".slim-dropdown");
for (i = 0; i < slimDropdownElements.length; i++) {
for (let i = 0; i < slimDropdownElements.length; i++) {
const parentNode = slimDropdownElements[i].parentNode;
parentNode.style.background = "transparent";
parentNode.style.border = "0";
@ -374,49 +367,43 @@ button.addEventListener("click", function () {
}
});
// Add event listener for mouseleave on the button
button.addEventListener("mouseleave", function () {
// Delay to prevent menu hiding when the mouse leaves the button into the menu
// Delay to prevent menu hiding when the mouse leaves the button or menu
function delayedHideMenu() {
setTimeout(function () {
if (!isMouseOverButtonOrMenu()) {
hideMenu();
}
}, 100);
});
}
// Add event listener for mouseleave on the button
button.addEventListener("mouseleave", delayedHideMenu);
// Add event listener for mouseleave on the menu
menu.addEventListener("mouseleave", function () {
// Delay to prevent menu hide when the mouse leaves the menu into the button
setTimeout(function () {
if (!isMouseOverButtonOrMenu()) {
hideMenu();
}
}, 100);
});
menu.addEventListener("mouseleave", delayedHideMenu);
// Add event listener for click anywhere in the document
document.addEventListener("click", function (event) {
const target = event.target;
// Check if the click is outside the button/menu and the menu is visible
if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") {
hideMenu();
}
if (event.target.classList.contains("pfp_character")) {
const target = event.target;
if (target.classList.contains("pfp_character")) {
toggleBigPicture();
}
// Handle sidebar clicks on mobile
if (isMobile()) {
// Check if the click did NOT originate from any of the specified toggle buttons or elements
// Check if the click did NOT originate from any of the specified toggle buttons or elements
if (
target.closest("#navigation-toggle") !== navigationToggle &&
target.closest("#past-chats-toggle") !== pastChatsToggle &&
target.closest("#chat-controls-toggle") !== chatControlsToggle &&
target.closest(".header_bar") !== headerBar &&
target.closest("#past-chats-row") !== pastChatsRow &&
target.closest("#chat-controls") !== chatControlsRow
target.closest("#past-chats-toggle") !== pastChatsToggle &&
target.closest("#chat-controls-toggle") !== chatControlsToggle &&
target.closest(".header_bar") !== headerBar &&
target.closest("#past-chats-row") !== pastChatsRow &&
target.closest("#chat-controls") !== chatControlsRow
) {
handleIndividualSidebarClose(event);
}
@ -433,27 +420,19 @@ document.getElementById("chat-input-row").classList.add("chat-input-positioned")
//------------------------------------------------
const chatTextArea = document.getElementById("chat-input").querySelector("textarea");
function respondToChatInputVisibility(element, callback) {
var options = {
root: document.documentElement,
};
var observer = new IntersectionObserver((entries, observer) => {
function focusOnVisible(element) {
var observer = new IntersectionObserver((entries) => {
entries.forEach(entry => {
callback(entry.intersectionRatio > 0);
if (entry.intersectionRatio > 0) {
element.focus();
}
});
}, options);
}, { root: document.documentElement });
observer.observe(element);
}
function handleChatInputVisibilityChange(isVisible) {
if (isVisible) {
chatTextArea.focus();
}
}
respondToChatInputVisibility(chatTextArea, handleChatInputVisibilityChange);
focusOnVisible(chatTextArea);
//------------------------------------------------
// Show enlarged character picture when the profile
@ -463,8 +442,7 @@ let bigPictureVisible = false;
function addBigPicture() {
var imgElement = document.createElement("img");
var timestamp = new Date().getTime();
imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
imgElement.src = getProfilePictureUrl();
imgElement.classList.add("bigProfilePicture");
imgElement.addEventListener("load", function () {
this.style.visibility = "visible";
@ -478,9 +456,8 @@ function addBigPicture() {
}
function deleteBigPicture() {
var bigProfilePictures = document.querySelectorAll(".bigProfilePicture");
bigProfilePictures.forEach(function (element) {
element.parentNode.removeChild(element);
document.querySelectorAll(".bigProfilePicture").forEach(function (element) {
element.remove();
});
}
@ -494,44 +471,11 @@ function toggleBigPicture() {
}
}
//------------------------------------------------
// Handle the chat input box growth
//------------------------------------------------
// Cache DOM elements
const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
const chatInput = document.querySelector("#chat-input textarea");
// Variables to store current dimensions
let currentChatInputHeight = chatInput.clientHeight;
//------------------------------------------------
// Focus on the rename text area when it becomes visible
//------------------------------------------------
const renameTextArea = document.getElementById("rename-row").querySelector("textarea");
function respondToRenameVisibility(element, callback) {
var options = {
root: document.documentElement,
};
var observer = new IntersectionObserver((entries, observer) => {
entries.forEach(entry => {
callback(entry.intersectionRatio > 0);
});
}, options);
observer.observe(element);
}
function handleVisibilityChange(isVisible) {
if (isVisible) {
renameTextArea.focus();
}
}
respondToRenameVisibility(renameTextArea, handleVisibilityChange);
focusOnVisible(renameTextArea);
//------------------------------------------------
// Adjust the chat tab margin if no extension UI
@ -737,21 +681,21 @@ function handleIndividualSidebarClose(event) {
// Close navigation bar if click is outside and it is open
if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) {
toggleSidebar(headerBar, navigationToggle, true);
toggleSidebar(headerBar, navigationToggle);
}
// Close past chats row if click is outside and it is open
if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(pastChatsRow, pastChatsToggle, true);
toggleSidebar(pastChatsRow, pastChatsToggle);
}
// Close chat controls row if click is outside and it is open
if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(chatControlsRow, chatControlsToggle, true);
toggleSidebar(chatControlsRow, chatControlsToggle);
}
}
function toggleSidebar(sidebar, toggle, forceClose = false) {
function toggleSidebar(sidebar, toggle) {
const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden");
const shouldClose = !isCurrentlyHidden;
@ -776,11 +720,6 @@ function toggleSidebar(sidebar, toggle, forceClose = false) {
toggle.classList.toggle("chat-controls-open", !shouldClose);
toggle.innerHTML = shouldClose ? leftArrowSVG : rightArrowSVG;
}
// Mobile handling
if (isMobile()) {
sidebar.classList.toggle("sidebar-shown", !shouldClose);
}
}
// Function to check if the device is mobile
@ -840,17 +779,17 @@ pastChatsToggle.addEventListener("click", () => {
const isCurrentlyOpen = !pastChatsRow.classList.contains("sidebar-hidden");
toggleSidebar(pastChatsRow, pastChatsToggle);
// On desktop, open/close both sidebars at the same time
// On desktop, sync both sidebars together
if (!isMobile()) {
if (isCurrentlyOpen) {
// If we just closed the left sidebar, also close the right sidebar
if (!chatControlsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(chatControlsRow, chatControlsToggle, true);
toggleSidebar(chatControlsRow, chatControlsToggle);
}
} else {
// If we just opened the left sidebar, also open the right sidebar
if (chatControlsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(chatControlsRow, chatControlsToggle, false);
toggleSidebar(chatControlsRow, chatControlsToggle);
}
}
}
@ -860,17 +799,17 @@ chatControlsToggle.addEventListener("click", () => {
const isCurrentlyOpen = !chatControlsRow.classList.contains("sidebar-hidden");
toggleSidebar(chatControlsRow, chatControlsToggle);
// On desktop, open/close both sidebars at the same time
// On desktop, sync both sidebars together
if (!isMobile()) {
if (isCurrentlyOpen) {
// If we just closed the right sidebar, also close the left sidebar
if (!pastChatsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(pastChatsRow, pastChatsToggle, true);
toggleSidebar(pastChatsRow, pastChatsToggle);
}
} else {
// If we just opened the right sidebar, also open the left sidebar
if (pastChatsRow.classList.contains("sidebar-hidden")) {
toggleSidebar(pastChatsRow, pastChatsToggle, false);
toggleSidebar(pastChatsRow, pastChatsToggle);
}
}
}
@ -890,7 +829,7 @@ if (isMobile()) {
const textarea = document.querySelector("#chat-input textarea");
if (textarea) {
// Simulate adding and removing a newline
// Force textarea height recalculation by simulating content change
textarea.value += "\n";
textarea.dispatchEvent(new Event("input", { bubbles: true }));
textarea.value = textarea.value.slice(0, -1);

View file

@ -1,10 +1,9 @@
// Functions for downloading JSON files
function getCurrentTimestamp() {
const now = new Date();
const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert to milliseconds
const timezoneOffset = now.getTimezoneOffset() * 60000; // Convert minutes to milliseconds
const localTime = new Date(now.getTime() - timezoneOffset);
const formattedTimestamp = localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
return formattedTimestamp;
return localTime.toISOString().replace(/[-:]/g, "").slice(0, 15);
}
function saveFile(contents, filename) {
@ -18,23 +17,18 @@ function saveFile(contents, filename) {
}
function saveHistory(history, character, mode) {
let path = null;
let path;
if (["chat", "chat-instruct"].includes(mode) && character && character.trim() !== "") {
path = `history_${character}_${getCurrentTimestamp()}.json`;
} else {
try {
path = `history_${mode}_${getCurrentTimestamp()}.json`;
} catch (error) {
path = `history_${getCurrentTimestamp()}.json`;
}
path = `history_${mode || "unknown"}_${getCurrentTimestamp()}.json`;
}
saveFile(history, path);
}
function saveSession(session) {
let path = null;
path = `session_${getCurrentTimestamp()}.json`;
const path = `session_${getCurrentTimestamp()}.json`;
saveFile(session, path);
}

View file

@ -1,13 +1,11 @@
const chatParent = document.querySelector(".chat-parent");
function toggle_controls(value) {
const navToggle = document.getElementById("navigation-toggle");
const pastChatsToggle = document.getElementById("past-chats-toggle");
const extensions = document.querySelector("#extensions");
const galleryExtension = document.getElementById("gallery-extension");
if (value) {
// SHOW MODE: Click toggles to show hidden sidebars
const navToggle = document.getElementById("navigation-toggle");
const pastChatsToggle = document.getElementById("past-chats-toggle");
if (navToggle && document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
navToggle.click();
}
@ -19,17 +17,11 @@ function toggle_controls(value) {
if (extensions) {
extensions.style.display = "inherit";
}
let gallery_element = document.getElementById("gallery-extension");
if (gallery_element) {
gallery_element.style.display = "block";
if (galleryExtension) {
galleryExtension.style.display = "block";
}
} else {
// HIDE MODE: Click toggles to hide visible sidebars
const navToggle = document.getElementById("navigation-toggle");
const pastChatsToggle = document.getElementById("past-chats-toggle");
if (navToggle && !document.querySelector(".header_bar")?.classList.contains("sidebar-hidden")) {
navToggle.click();
}
@ -41,5 +33,8 @@ function toggle_controls(value) {
if (extensions) {
extensions.style.display = "none";
}
if (galleryExtension) {
galleryExtension.style.display = "none";
}
}
}

View file

@ -2,17 +2,9 @@ function scrollToTop() {
window.scrollTo({ top: 0 });
}
function findButtonsByText(buttonText) {
const buttons = document.getElementsByTagName("button");
const matchingButtons = [];
for (let i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === buttonText) {
matchingButtons.push(buttons[i]);
}
}
return matchingButtons;
function findButtonsByText(buttonText, container = document) {
return Array.from(container.getElementsByTagName("button"))
.filter(btn => btn.textContent.trim() === buttonText);
}
function switch_to_chat() {
@ -39,13 +31,9 @@ function switch_to_character() {
function switch_to_image_ai_generate() {
const container = document.querySelector("#image-ai-tab");
const buttons = container.getElementsByTagName("button");
for (let i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === "Generate") {
buttons[i].click();
break;
}
const generateBtn = findButtonsByText("Generate", container)[0];
if (generateBtn) {
generateBtn.click();
}
scrollToTop();

View file

@ -1,7 +1,6 @@
function updateBigPicture() {
var existingElement = document.querySelector(".bigProfilePicture");
if (existingElement) {
var timestamp = new Date().getTime();
existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
existingElement.src = getProfilePictureUrl();
}
}

View file

@ -39,6 +39,146 @@ def load_chat_template_file(filepath):
return text
def _first_token_display_str(token_id, prompt, tokenizer):
"""Return the display string for the first prompt token.
Returns empty string for BOS or tokens that don't appear at the start
of the prompt text, so they don't shift text_offset for subsequent tokens.
"""
token_id = int(token_id)
bos_id = getattr(tokenizer, 'bos_token_id', None)
if bos_id is not None and token_id == bos_id:
return ""
import torch
tok = tokenizer.decode(torch.tensor([token_id]))
if not prompt.startswith(tok):
return ""
return tok
def _compute_prompt_logprob_entries(prompt, logprobs_count, input_ids=None):
"""Compute logprob entries for prompt tokens via a forward pass.
Returns a list of logprob entries in the standard format.
The first token gets a null entry (no conditioning context).
Supported for HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
via a single forward pass, and for llama.cpp via the server's
prompt_logprobs parameter. Returns [] for unsupported loaders.
"""
if input_ids is None:
input_ids = encode(prompt) # (1, seq_len) tensor or array
token_ids = input_ids[0]
n_tokens = len(token_ids)
if n_tokens == 0:
return []
loader = shared.args.loader
model = shared.model
if loader == 'llama.cpp':
return model.get_prompt_logprob_entries(token_ids, max(logprobs_count, 1), prompt=prompt)
first_token_str = _first_token_display_str(token_ids[0], prompt, shared.tokenizer)
if n_tokens <= 1:
return [{"token": first_token_str, "null_logprob": True}]
import torch
from modules.torch_utils import clear_torch_cache
if hasattr(model, 'get_prompt_logits'):
logits = model.get_prompt_logits(input_ids)
elif hasattr(model, 'forward'):
# HF-compatible loaders (Transformers, ExLlamav3_HF, etc.)
input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
if hasattr(model, 'device'):
input_ids_tensor = input_ids_tensor.to(model.device)
with torch.no_grad():
# Pass labels to ensure logits are returned for ALL positions,
# not just the last token (some HF wrappers like ExLlamav3_HF
# only compute the last-token logits when labels are absent).
outputs = model(input_ids=input_ids_tensor, labels=input_ids_tensor)
logits = outputs.logits # keep on GPU, (1, seq_len, vocab) in model dtype
del outputs
else:
return []
entries = [{"token": first_token_str, "null_logprob": True}]
logprobs_count = max(logprobs_count, 1)
k = min(logprobs_count, logits.shape[-1])
chunk_size = 2048
unique_ids = set(int(tid) for tid in token_ids[1:])
# Process logits in chunks on GPU, only move top-K results to CPU
all_top_log_probs_list = []
all_top_indices_list = []
all_actual_lps = []
for start in range(0, n_tokens - 1, chunk_size):
end = min(start + chunk_size, n_tokens - 1)
chunk_logits = logits[0, start:end].float() # (chunk, vocab) on GPU
chunk_lse = torch.logsumexp(chunk_logits, dim=-1)
chunk_top_values, chunk_top_indices = torch.topk(chunk_logits, k=k, dim=-1)
chunk_top_log_probs = chunk_top_values - chunk_lse.unsqueeze(-1)
# Compute logprob for actual next tokens in this chunk
chunk_top_sets = [set(chunk_top_indices[j].tolist()) for j in range(end - start)]
for j in range(end - start):
actual_tid = int(token_ids[start + j + 1])
if actual_tid not in chunk_top_sets[j]:
all_actual_lps.append((chunk_logits[j, actual_tid] - chunk_lse[j]).item())
else:
all_actual_lps.append(None) # will use top_log_probs
all_top_log_probs_list.append(chunk_top_log_probs.cpu())
all_top_indices_list.append(chunk_top_indices.cpu())
unique_ids.update(int(tid) for tid in chunk_top_indices.flatten().tolist())
del chunk_logits, chunk_lse, chunk_top_values
del logits
clear_torch_cache()
all_top_log_probs = torch.cat(all_top_log_probs_list, dim=0)
all_top_indices = torch.cat(all_top_indices_list, dim=0)
unique_ids_list = sorted(unique_ids)
decoded_list = shared.tokenizer.batch_decode([[tid] for tid in unique_ids_list]) if hasattr(shared.tokenizer, 'batch_decode') else [shared.tokenizer.decode(torch.tensor([tid])) for tid in unique_ids_list]
decoded_strs = dict(zip(unique_ids_list, decoded_list))
for i in range(1, n_tokens):
token_id = int(token_ids[i])
idx = i - 1
top_log_probs = all_top_log_probs[idx]
top_ids = all_top_indices[idx].tolist()
actual_token_str = decoded_strs[token_id]
if token_id in top_ids:
actual_lp = top_log_probs[top_ids.index(token_id)].item()
alternatives = [
{"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
for j in range(k) if top_ids[j] != token_id
]
else:
actual_lp = all_actual_lps[idx]
alternatives = [
{"token": decoded_strs[top_ids[j]], "token_id": top_ids[j], "logprob": top_log_probs[j].item()}
for j in range(k - 1)
]
entry = {"top_logprobs": [{"token": actual_token_str, "token_id": token_id, "logprob": actual_lp}] + alternatives}
entries.append(entry)
return entries
def _get_raw_logprob_entries(offset=0):
"""Get raw logprob entries from llama.cpp/ExLlamav3 backend, starting from offset.
@ -65,6 +205,21 @@ def _parse_entry_top(entry):
return entry.get('top_logprobs', entry.get('top_probs', []))
def _extract_sampled_token(entry, top):
"""Get the actually sampled token and its logprob from a logprob entry.
Uses the entry-level token/logprob when available (the actually sampled
token), falling back to top[0] (highest-probability alternative) which
may differ with non-greedy sampling.
"""
if 'token' in entry:
return entry['token'], entry.get('logprob', entry.get('prob', 0))
token_str = top[0].get('token', '')
token_logprob = top[0].get('logprob', top[0].get('prob', 0))
return token_str, token_logprob
def format_chat_logprobs(entries):
"""Format logprob entries into OpenAI chat completions logprobs format.
@ -79,9 +234,7 @@ def format_chat_logprobs(entries):
if not top:
continue
chosen = top[0]
token_str = chosen.get('token', '')
token_logprob = chosen.get('logprob', chosen.get('prob', 0))
token_str, token_logprob = _extract_sampled_token(entry, top)
top_list = []
for item in top:
@ -106,7 +259,7 @@ def format_chat_logprobs(entries):
def format_completion_logprobs(entries):
"""Format logprob entries into OpenAI completions logprobs format.
Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "text_offset"}
Output: {"tokens", "token_logprobs", "top_logprobs": [{token: prob}], "top_logprobs_ids": [{token_id: prob}], "text_offset"}
"""
if not entries:
return None
@ -114,17 +267,27 @@ def format_completion_logprobs(entries):
tokens = []
token_logprobs = []
top_logprobs = []
top_logprobs_ids = []
text_offset = []
offset = 0
for entry in entries:
# Handle null logprob entries (first prompt token with echo)
if entry.get("null_logprob"):
token_str = entry.get("token", "")
tokens.append(token_str)
token_logprobs.append(None)
top_logprobs.append(None)
top_logprobs_ids.append(None)
text_offset.append(offset)
offset += len(token_str)
continue
top = _parse_entry_top(entry)
if not top:
continue
chosen = top[0]
token_str = chosen.get('token', '')
token_logprob = chosen.get('logprob', chosen.get('prob', 0))
token_str, token_logprob = _extract_sampled_token(entry, top)
tokens.append(token_str)
token_logprobs.append(token_logprob)
@ -132,21 +295,29 @@ def format_completion_logprobs(entries):
offset += len(token_str)
top_dict = {}
top_dict_ids = {}
for item in top:
t = item.get('token', '')
lp = item.get('logprob', item.get('prob', 0))
top_dict[t] = lp
tid = item.get('token_id', item.get('id'))
if tid is not None:
top_dict_ids[tid] = lp
top_logprobs.append(top_dict)
top_logprobs_ids.append(top_dict_ids if top_dict_ids else None)
if not tokens:
return None
return {
result = {
"tokens": tokens,
"token_logprobs": token_logprobs,
"top_logprobs": top_logprobs,
"text_offset": text_offset
}
if any(x is not None for x in top_logprobs_ids):
result["top_logprobs_ids"] = top_logprobs_ids
return result
def process_parameters(body, is_legacy=False):
@ -407,7 +578,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
})
max_tokens = generate_params['max_new_tokens']
if max_tokens in [None, 0]:
if max_tokens is not None and max_tokens <= 0:
raise InvalidRequestError(message="max_tokens must be greater than 0.", param="max_tokens")
if max_tokens is None:
generate_params['max_new_tokens'] = 512
generate_params['auto_max_new_tokens'] = True
@ -652,6 +826,15 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
# common params
generate_params = process_parameters(body, is_legacy=is_legacy)
max_tokens = generate_params['max_new_tokens']
if max_tokens is None:
generate_params['max_new_tokens'] = 512
generate_params['auto_max_new_tokens'] = True
max_tokens = 512
elif max_tokens < 0:
raise InvalidRequestError(message="max_tokens must be greater than or equal to 0.", param="max_tokens")
elif max_tokens == 0 and body.get('logprobs') is None:
raise InvalidRequestError(message="max_tokens is 0 but no logprobs parameter was specified.", param="max_tokens")
generate_params['stream'] = stream
if stop_event is not None:
generate_params['stop_event'] = stop_event
@ -700,9 +883,17 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
prompt = decode(prompt)[0]
prefix = prompt if echo else ''
token_count = len(encode(prompt)[0])
prompt_input_ids = encode(prompt)
token_count = len(prompt_input_ids[0])
total_prompt_token_count += token_count
# Compute prompt logprobs once per prompt (shared across n_completions)
logprobs_val = body.get('logprobs', None)
if echo and logprobs_val is not None and logprobs_val >= 0:
prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
else:
prompt_entries = None
original_seed = generate_params.get('seed', -1)
for _n in range(n_completions):
# Increment seed for each completion to ensure diversity (matches llama.cpp native behavior)
@ -713,29 +904,41 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
logprob_proc.token_alternatives_history.clear()
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
for a in generator:
answer = a
completion_token_count = len(encode(answer)[0])
total_completion_token_count += completion_token_count
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
if logprob_proc:
all_entries = []
for alt in logprob_proc.token_alternatives_history:
all_entries.extend(_dict_to_logprob_entries(alt))
completion_logprobs = format_completion_logprobs(all_entries)
elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
raw = getattr(shared.model, 'last_completion_probabilities', None)
completion_logprobs = format_completion_logprobs(raw)
if max_tokens == 0:
answer = ''
completion_token_count = 0
stop_reason = "stop"
else:
completion_logprobs = None
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
for a in generator:
answer = a
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
total_completion_token_count += completion_token_count
if max_tokens == 0:
all_entries = []
else:
if logprob_proc:
all_entries = []
for alt in logprob_proc.token_alternatives_history:
all_entries.extend(_dict_to_logprob_entries(alt))
elif shared.args.loader in ('llama.cpp', 'ExLlamav3'):
all_entries = getattr(shared.model, 'last_completion_probabilities', None) or []
else:
all_entries = []
if prompt_entries:
all_entries = prompt_entries + all_entries
completion_logprobs = format_completion_logprobs(all_entries) if all_entries else None
respi = {
"index": choice_index,
@ -775,7 +978,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
raise InvalidRequestError(message="API Batched generation not yet supported.", param=prompt_str)
prefix = prompt if echo else ''
token_count = len(encode(prompt)[0])
prompt_input_ids = encode(prompt)
token_count = len(prompt_input_ids[0])
# Check if usage should be included in streaming chunks per OpenAI spec
stream_options = body.get('stream_options')
@ -808,37 +1012,57 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False, stop_e
return chunk
logprobs_val = body.get('logprobs', None)
if echo and logprobs_val is not None and logprobs_val >= 0:
prompt_entries = _compute_prompt_logprob_entries(prompt, logprobs_val, input_ids=prompt_input_ids)
prompt_logprobs_formatted = format_completion_logprobs(prompt_entries) if prompt_entries else None
else:
prompt_logprobs_formatted = None
# Clear stale logprobs from any previous request before building the
# first chunk, so text_streaming_chunk doesn't pick up old data.
if hasattr(shared.model, 'last_completion_probabilities'):
shared.model.last_completion_probabilities = []
cmpl_logprobs_offset[0] = 0
chunk = text_streaming_chunk(prefix)
if prompt_logprobs_formatted is not None:
chunk[resp_list][0]["logprobs"] = prompt_logprobs_formatted
if include_usage:
chunk['usage'] = None
yield chunk
# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
seen_content = ''
completion_token_count = 0
if max_tokens == 0:
answer = ''
completion_token_count = 0
stop_reason = "stop"
else:
debug_msg({'prompt': prompt, 'generate_params': generate_params})
generator = generate_reply(prompt, generate_params, is_chat=False)
answer = ''
seen_content = ''
completion_token_count = 0
for a in generator:
answer = a
for a in generator:
answer = a
len_seen = len(seen_content)
new_content = answer[len_seen:]
len_seen = len(seen_content)
new_content = answer[len_seen:]
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue
seen_content = answer
chunk = text_streaming_chunk(new_content)
if include_usage:
chunk['usage'] = None
yield chunk
seen_content = answer
chunk = text_streaming_chunk(new_content)
if include_usage:
chunk['usage'] = None
yield chunk
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= max_tokens:
stop_reason = "length"
chunk = text_streaming_chunk(suffix)
chunk[resp_list][0]["finish_reason"] = stop_reason

View file

@ -68,7 +68,7 @@ def _load_model(data):
if k in shared.settings:
shared.settings[k] = settings[k]
if k == 'truncation_length':
logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}")
logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}")
elif k == 'instruction_template':
logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}")

View file

@ -671,7 +671,10 @@ def get_stopping_strings(state):
# Handle GPT-OSS as a special case
if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
result.remove("<|end|>")
result.append("<|result|>")
if '<|result|>' in state['instruction_template_str']:
result.append("<|result|>")
elif '<|return|>' in state['instruction_template_str']:
result.append("<|return|>")
result = list(set(result))
if shared.args.verbose:

View file

@ -423,6 +423,15 @@ class Exllamav3Model:
if logit_bias:
filters.append(LogitBiasFilter(self.tokenizer, logit_bias))
# Suppress EOS tokens via logit bias so they are never sampled
if state['ban_eos_token']:
eos_bias = {}
for eos_id in self.config.eos_token_id_list:
if eos_id is not None:
eos_bias[str(eos_id)] = float('-inf')
if eos_bias:
filters.append(LogitBiasFilter(self.tokenizer, eos_bias))
# Logprobs support (OpenAI API)
logprobs = state.get('logprobs', 0) or 0
return_top_tokens = logprobs if logprobs > 0 else 0
@ -480,15 +489,35 @@ class Exllamav3Model:
return
id_to_piece = self.tokenizer.get_id_to_piece_list(True)
sampled_ids = result.get("token_ids") # (batch, seq_len) - actually sampled tokens
sampled_probs = result.get("token_probs") # (batch, seq_len) - their probabilities
def _piece(tid):
s = id_to_piece[tid] if tid < len(id_to_piece) else f"<{tid}>"
return s.replace('\u2581', ' ')
def _logprob(prob):
return math.log(prob) if prob > 0 else float("-inf")
# top_k_tokens shape: (batch, seq_len, k), top_k_probs same
for seq_idx in range(top_k_tokens.shape[1]):
entry = {"top_logprobs": []}
for k_idx in range(top_k_tokens.shape[2]):
token_id = top_k_tokens[0, seq_idx, k_idx].item()
prob = top_k_probs[0, seq_idx, k_idx].item()
token_str = id_to_piece[token_id] if token_id < len(id_to_piece) else f"<{token_id}>"
logprob = math.log(prob) if prob > 0 else float("-inf")
entry["top_logprobs"].append({"token": token_str, "logprob": logprob})
entry["top_logprobs"].append({"token": _piece(token_id), "logprob": _logprob(prob)})
# Record the actually sampled token at the entry level so
# format_completion_logprobs uses it instead of top_logprobs[0]
# (they differ with non-greedy sampling).
if sampled_ids is not None:
sid = sampled_ids[0, seq_idx].item()
entry["token"] = _piece(sid)
if sampled_probs is not None:
entry["logprob"] = _logprob(sampled_probs[0, seq_idx].item())
else:
entry["logprob"] = None
self.last_completion_probabilities.append(entry)
def generate(self, prompt, state):
@ -498,42 +527,31 @@ class Exllamav3Model:
return output
def get_prompt_logits(self, input_ids):
"""Return logits for all positions via a single no-cache forward pass.
Used by prompt logprobs computation. Returns (1, seq_len, vocab) on CPU in float32.
"""
import torch
input_ids_tensor = input_ids if isinstance(input_ids, torch.Tensor) else torch.tensor(input_ids, dtype=torch.long)
input_ids_tensor = input_ids_tensor.view(1, -1).cpu()
with torch.no_grad():
return self.model.forward(
input_ids=input_ids_tensor,
params={"attn_mode": "flash_attn_nc"}
).cpu().float()
def get_logits(self, token_ids, **kwargs):
"""
Process a batch of token_ids and return the logits for the last token.
This will reset and overwrite the model's cache.
Uses flash_attn_nc (no cache) for correct results with recurrent models.
"""
# Initialize a single params dictionary that will be updated in-place
params = {
"cache": self.cache,
"reconstruct": False,
"attn_mode": "flash_attn",
"batch_shape": (1, self.max_tokens),
"past_len": 0
}
params.update(kwargs)
# Process prefix tokens to fill the cache and generate recurrent state
if token_ids.shape[-1] > 1:
prefix_ids = token_ids[:, :-1]
# This forward call updates the 'params' dict with the recurrent state
self.model.forward(
input_ids=prefix_ids,
params=params
)
# Update past_len for the next call
params["past_len"] = prefix_ids.shape[-1]
# Process the last token, now using the state-filled 'params' dict
last_token_ids = token_ids[:, -1:]
logits = self.model.forward(
input_ids=last_token_ids,
params=params
input_ids=token_ids,
params={"attn_mode": "flash_attn_nc"}
)
return logits.float().cpu()
return logits[:, -1:, :].float().cpu()
def encode(self, string, **kwargs):
add_bos = kwargs.pop('add_bos', True)

View file

@ -26,6 +26,9 @@ except Exception:
class Exllamav3HF(PreTrainedModel, GenerationMixin):
def __init__(self, model_dir):
hf_config = PretrainedConfig.from_pretrained(model_dir)
# Ensure text_config is a proper object, not a dict (fixes qwen3_5_moe + transformers compat)
if isinstance(getattr(hf_config, 'text_config', None), dict):
hf_config.text_config = PretrainedConfig(**hf_config.text_config)
super().__init__(hf_config)
exl3_config = Config.from_directory(model_dir)
@ -199,30 +202,11 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
}
).to(input_ids.device).float()
else:
# Labels path: use cache for cross-chunk attention.
tokens_to_process = seq_tensor
all_logits = None
current_len = 0
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
chunk_logits = self.ex_model.forward(
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": current_len,
"batch_shape": (1, self.max_tokens),
}
).float()
current_len += chunk.shape[0]
if all_logits is None:
all_logits = chunk_logits
else:
all_logits = torch.cat([all_logits, chunk_logits], dim=1)
logits = all_logits
# Labels path: single pass without cache for correct logits
logits = self.ex_model.forward(
input_ids=seq_tensor.view(1, -1),
params={"attn_mode": "flash_attn_nc"}
).float().cpu()
if is_negative:
self.past_seq_negative = seq_tensor

View file

@ -191,21 +191,19 @@ def _apply_custom_generate_reply():
def _apply_custom_css():
all_css = ''
for extension, _ in iterator():
if hasattr(extension, 'custom_css'):
all_css += getattr(extension, 'custom_css')()
return all_css
return ''.join(
getattr(extension, 'custom_css')()
for extension, _ in iterator()
if hasattr(extension, 'custom_css')
)
def _apply_custom_js():
all_js = ''
for extension, _ in iterator():
if hasattr(extension, 'custom_js'):
all_js += getattr(extension, 'custom_js')()
return all_js
return ''.join(
getattr(extension, 'custom_js')()
for extension, _ in iterator()
if hasattr(extension, 'custom_js')
)
def create_extensions_block():

View file

@ -11,7 +11,6 @@ import time
from pathlib import Path
from typing import Any, List
import llama_cpp_binaries
import requests
from modules import shared
@ -311,8 +310,45 @@ class LlamaServer:
else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
def get_prompt_logprob_entries(self, token_ids, n_probs=5, prompt=""):
"""Get logprob entries for prompt tokens via a single n_predict=0 request.
Requires llama.cpp server with prompt_logprobs support.
Returns entries in the standard format for format_completion_logprobs().
"""
token_ids_list = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids)
url = f"http://127.0.0.1:{self.port}/completion"
payload = {
"prompt": token_ids_list,
"n_predict": 0,
"n_probs": n_probs,
"prompt_logprobs": True,
"stream": False,
"cache_prompt": False,
}
response = self.session.post(url, json=payload)
result = response.json()
prompt_probs = result.get("prompt_probabilities", [])
if not prompt_probs:
return []
# Null first token (no conditioning context); use empty string for BOS
# or tokens that don't appear at the start of the prompt text.
first_token_str = self.decode([token_ids_list[0]])
if self.bos_token and first_token_str == self.bos_token:
first_token_str = ""
elif not prompt.startswith(first_token_str):
first_token_str = ""
entries = [{"token": first_token_str, "null_logprob": True}]
entries.extend(prompt_probs)
return entries
def _get_vocabulary_size(self):
"""Get and store the model's maximum context length."""
"""Get and store the model's vocabulary size."""
url = f"http://127.0.0.1:{self.port}/v1/models"
response = self.session.get(url).json()
@ -357,7 +393,16 @@ class LlamaServer:
"""Start the llama.cpp server and wait until it's ready."""
# Determine the server path
if self.server_path is None:
self.server_path = llama_cpp_binaries.get_binary_path()
if shared.args.ik:
try:
import ik_llama_cpp_binaries
except ImportError:
raise ImportError("--ik requires the ik_llama_cpp_binaries package. Install it with: pip install <ik_llama_cpp_binaries wheel URL>")
self.server_path = ik_llama_cpp_binaries.get_binary_path()
else:
import llama_cpp_binaries
self.server_path = llama_cpp_binaries.get_binary_path()
# Build the command
cmd = [
@ -470,6 +515,10 @@ class LlamaServer:
else:
cmd.append(f"--{flag_item}")
# Patch flags for ik_llama.cpp compatibility
if shared.args.ik:
cmd = _patch_cmd_for_ik(cmd)
env = os.environ.copy()
if os.name == 'posix':
current_path = env.get('LD_LIBRARY_PATH', '')
@ -607,3 +656,49 @@ def filter_stderr_with_progress(process_stderr):
process_stderr.close()
except Exception:
pass
def _patch_cmd_for_ik(cmd):
    """
    Rewrite upstream llama.cpp server flags into their ik_llama.cpp
    equivalents and return a new argument list.

    Mappings:
      --no-webui            -> --webui none
      --fit off             -> (removed)
      --fit on / --fit-ctx  -> --fit (bare flag)
      --fit-target          -> --fit-margin (value passes through)
      --cache-reuse <val>   -> (removed, unsupported)
      --swa-full            -> (removed, unsupported)

    The input list is never mutated; callers always receive a fresh list.
    """
    patched = []
    i = 0
    while i < len(cmd):
        arg = cmd[i]
        if arg == "--no-webui":
            # ik_llama.cpp expects "--webui none" instead of a bare flag.
            patched += ["--webui", "none"]
        elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"):
            val = cmd[i + 1]
            i += 1  # consume the on/off value
            if val == "on":
                patched.append("--fit")
            # "off" -> drop the flag entirely
        elif arg == "--fit-ctx":
            patched.append("--fit")
            i += 1  # skip the value; ik's --fit takes no argument
        elif arg == "--fit-target":
            # Renamed flag; its value is appended unchanged on the next pass.
            patched.append("--fit-margin")
        elif arg == "--cache-reuse":
            i += 1  # unsupported; skip the flag and its value
        elif arg == "--swa-full":
            pass  # unsupported bare flag; drop it
        else:
            patched.append(arg)
        i += 1

    # Add Hadamard KV cache rotation when using quantized cache types.
    # This significantly improves quantized cache quality (especially q4_0)
    # and is a no-op for MLA models like DeepSeek. Appended to the result
    # instead of the input so the caller's list is left untouched.
    if shared.args.cache_type in ("q8_0", "q4_0"):
        patched += ["-khad", "-vhad"]

    return patched

View file

@ -20,6 +20,7 @@ loaders_and_params = OrderedDict({
'no_mmap',
'mlock',
'numa',
'ik',
'parallel',
'model_draft',
'draft_max',
@ -345,6 +346,7 @@ def list_model_elements():
'spec_ngram_size_m',
'spec_ngram_min_hits',
'mmproj',
'ik',
]

View file

@ -4,7 +4,6 @@ import numpy as np
from modules import models, shared
from modules.logging_colors import logger
from modules.models import load_model
from modules.text_generation import generate_reply
from modules.utils import check_model_loaded
@ -12,8 +11,7 @@ global_scores = None
def get_next_logits(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
shared.model, shared.tokenizer = load_model(shared.model_name)
models.load_model_if_idle_unloaded()
needs_lock = not args[2] # use_samplers
if needs_lock:

View file

@ -1,4 +1,5 @@
import sys
import threading
import time
import modules.shared as shared
@ -7,6 +8,15 @@ from modules.models_settings import get_model_metadata
from modules.utils import resolve_model_path
last_generation_time = time.time()
active_generation_count = 0
_generation_count_lock = threading.Lock()
def load_model_if_idle_unloaded():
    """Reload the previously selected model if the idle timeout unloaded it.

    Acts only when an idle timeout is configured, no model is currently
    loaded, and a prior model name is available; on reload, the idle
    timer is reset to the current time.
    """
    global last_generation_time

    if shared.args.idle_timeout <= 0:
        return
    if shared.model is not None:
        return
    if shared.model_name in (None, 'None'):
        return

    shared.model, shared.tokenizer = load_model(shared.model_name)
    last_generation_time = time.time()
def load_model(model_name, loader=None):
@ -66,8 +76,7 @@ def load_model(model_name, loader=None):
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
logger.info(f"LOADER: \"{loader}\"")
logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
return model, tokenizer
@ -159,7 +168,10 @@ def unload_model_if_idle():
while True:
shared.generation_lock.acquire()
try:
if time.time() - last_generation_time > shared.args.idle_timeout * 60:
with _generation_count_lock:
is_active = active_generation_count > 0
if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60:
if shared.model is not None:
logger.info("Unloading the model for inactivity.")
unload_model(keep_model_name=True)

View file

@ -23,14 +23,9 @@ def get_fallback_settings():
def get_model_metadata(model):
model_path = resolve_model_path(model)
model_settings = {}
# Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
settings = shared.model_config
for pat in settings:
if re.match(pat.lower(), Path(model).name.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
# Fallback settings
model_settings = get_fallback_settings()
path = model_path / 'config.json'
if path.exists():

View file

@ -110,6 +110,7 @@ group.add_argument('--numa', action='store_true', help='Activate NUMA task alloc
group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@ -454,17 +455,7 @@ def load_user_config():
args.loader = fix_loader_name(args.loader)
# Load model-specific settings
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
model_config = yaml.safe_load(open(p, 'r').read())
else:
model_config = {}
del p
# Load custom model-specific settings
user_config = load_user_config()
model_config = OrderedDict(model_config)
user_config = OrderedDict(user_config)

View file

@ -17,9 +17,7 @@ from modules.utils import check_model_loaded
def generate_reply(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
from modules.models import load_model
shared.model, shared.tokenizer = load_model(shared.model_name)
models.load_model_if_idle_unloaded()
state = args[1] if len(args) > 1 else kwargs.get('state', {})
use_parallel = (
@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs):
if not use_parallel:
shared.generation_lock.acquire()
with models._generation_count_lock:
models.active_generation_count += 1
try:
for result in _generate_reply(*args, **kwargs):
yield result
finally:
with models._generation_count_lock:
models.active_generation_count -= 1
models.last_generation_time = time.time()
if not use_parallel:
shared.generation_lock.release()
@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
models.load_model_if_idle_unloaded()
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':
@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
def decode(output_ids, skip_special_tokens=True):
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
models.load_model_if_idle_unloaded()
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens)

View file

@ -109,7 +109,6 @@ def load_model_HF(model_name):
params = {
'low_cpu_mem_usage': True,
'attn_implementation': shared.args.attn_implementation,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.original_args.trust_remote_code:
@ -120,6 +119,17 @@ def load_model_HF(model_name):
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.original_args.trust_remote_code)
# Determine torch_dtype: respect --bf16 flag, otherwise autodetect
# from model config, but never allow float32.
if shared.args.bf16:
params['torch_dtype'] = torch.bfloat16
else:
dtype = getattr(config, 'torch_dtype', None) or getattr(getattr(config, 'text_config', None), 'torch_dtype', None)
if dtype in (torch.float16, torch.bfloat16):
params['torch_dtype'] = dtype
else:
params['torch_dtype'] = torch.float16
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:

View file

@ -82,7 +82,7 @@ def create_ui():
gr.HTML("<div class='sidebar-vertical-separator'></div>")
shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.')
gr.HTML("<div class='sidebar-vertical-separator'></div>")

View file

@ -51,6 +51,9 @@ def create_ui():
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
if not shared.args.portable:
shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
bitsandbytes==0.49.*
datasets
@ -25,14 +25,14 @@ sentencepiece
tensorboard
torchao==0.15.*
trafilatura==2.0.0
transformers==5.3.*
transformers==5.5.*
triton-windows==3.5.1.post24; platform_system == "Windows"
tqdm
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -40,9 +40,11 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/turboderp-org/exllamav3/releases/download/v0.0.28/exllamav3-0.0.28+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,5 +37,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,4 +37,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,4 +37,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -37,5 +37,7 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -1,4 +1,4 @@
accelerate==1.12.*
accelerate==1.13.*
audioop-lts<1.0; python_version >= "3.13"
datasets
diffusers==0.37.*
@ -22,14 +22,14 @@ scipy
sentencepiece
tensorboard
torchao==0.15.*
transformers==5.3.*
transformers==5.5.*
tqdm
trafilatura==2.0.0
wandb
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,4 +23,4 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
pymupdf==1.27.*
python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
pymupdf==1.27.*
python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
sse-starlette==1.6.5
tiktoken
# ik_llama.cpp (CPU only)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -0,0 +1,27 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
pymupdf==1.27.*
python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/ik_llama_cpp_binaries-0.102.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15

View file

@ -14,8 +14,8 @@ trafilatura==2.0.0
tqdm
# Gradio
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio-4.37.2+custom.14-py3-none-any.whl
https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.14/gradio_client-1.0.2+custom.14-py3-none-any.whl
# API
flask_cloudflared==0.0.15
@ -23,5 +23,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.102.0/llama_cpp_binaries-0.102.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -18,7 +18,6 @@ import modules.extensions as extensions_module
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
get_fallback_settings,
get_model_metadata,
update_model_parameters
)
@ -271,10 +270,6 @@ if __name__ == "__main__":
# Apply CLI overrides for image model settings (CLI flags take precedence over saved settings)
shared.apply_image_model_cli_overrides()
# Fallback settings for models
shared.model_config['.*'] = get_fallback_settings()
shared.model_config.move_to_end('.*', last=False) # Move to the beginning
# Activate the extensions listed on settings.yaml
extensions_module.available_extensions = utils.get_available_extensions()
for extension in shared.settings['default_extensions']:

View file

@ -1,203 +0,0 @@
# Default per-model settings, keyed by regular expression.
# Each top-level key is a regex pattern (note the .* wildcards and the
# (?!...) negative lookaheads); the nested mapping holds the settings
# applied to models whose name matches that pattern.
# NOTE(review): matching semantics (case sensitivity, precedence between
# overlapping patterns such as .*wizardcoder vs .*wizardcoder-15b) are
# defined by the consumer of this file — confirm there before reordering
# or merging entries.

# --- model_type hints --------------------------------------------------
# Architecture identifier for the loader backend.
.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore):
  model_type: 'llama'
.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m):
  model_type: 'opt'
.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1):
  model_type: 'gptj'
.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm):
  model_type: 'gptneox'
.*bloom:
  model_type: 'bloom'
.*gpt2:
  model_type: 'gpt2'
.*falcon:
  model_type: 'falcon'
.*mpt:
  model_type: 'mpt'
.*(starcoder|starchat):
  model_type: 'starcoder'
# NOTE(review): dolly-v2 also matches the gptneox group above — which
# setting wins depends on the consumer's precedence rules.
.*dolly-v2:
  model_type: 'dollyv2'
.*replit:
  model_type: 'replit'

# --- instruction templates and tokenizer flags -------------------------
# Chat instruction template per model family; skip_special_tokens: false
# keeps special tokens in the decoded output for models that need them.
.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3):
  instruction_template: 'Open Assistant'
  skip_special_tokens: false
(?!.*galactica)(?!.*reward).*openassistant:
  instruction_template: 'Open Assistant'
  skip_special_tokens: false
.*galactica:
  skip_special_tokens: false
.*dolly-v[0-9]-[0-9]*b:
  instruction_template: 'Alpaca'
  skip_special_tokens: false
.*alpaca-native-4bit:
  instruction_template: 'Alpaca'
.*llava:
  instruction_template: 'LLaVA'
.*llava.*1.5:
  instruction_template: 'Vicuna-v1.1'
.*wizard.*mega:
  instruction_template: 'Wizard-Mega'
.*starchat-beta:
  instruction_template: 'Starchat-Beta'
# Vicuna variants: lookaheads route v0 vs v1.x vs stable vs chinese.
(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
  instruction_template: 'Vicuna-v0'
.*vicuna.*v0:
  instruction_template: 'Vicuna-v0'
.*vicuna.*(1.1|1_1|1.3|1_3):
  instruction_template: 'Vicuna-v1.1'
.*vicuna.*(1.5|1_5):
  instruction_template: 'Vicuna-v1.1'
.*stable.*vicuna:
  instruction_template: 'StableVicuna'
(?!.*chat).*chinese-vicuna:
  instruction_template: 'Alpaca'
.*chinese-vicuna.*chat:
  instruction_template: 'Chinese-Vicuna-Chat'
.*alpaca:
  instruction_template: 'Alpaca'
.*koala:
  instruction_template: 'Koala'
.*chatglm:
  instruction_template: 'ChatGLM'
.*(metharme|pygmalion|mythalion):
  instruction_template: 'Metharme'
.*raven:
  instruction_template: 'RWKV-Raven'
.*moss-moon.*sft:
  instruction_template: 'MOSS'
.*stablelm-tuned:
  instruction_template: 'StableLM'
.*galactica.*finetuned:
  instruction_template: 'Galactica Finetuned'
.*galactica.*-v2:
  instruction_template: 'Galactica v2'
(?!.*finetuned)(?!.*-v2).*galactica:
  instruction_template: 'Galactica'
.*guanaco:
  instruction_template: 'Guanaco non-chat'
.*baize:
  instruction_template: 'Baize'
.*mpt-.*instruct:
  instruction_template: 'Alpaca'
.*mpt-.*chat:
  instruction_template: 'ChatML'
(?!.*-flan-)(?!.*-t5-).*lamini-:
  instruction_template: 'Alpaca'
.*incite.*chat:
  instruction_template: 'INCITE-Chat'
.*incite.*instruct:
  instruction_template: 'INCITE-Instruct'
.*ziya-:
  instruction_template: 'Ziya'
.*koalpaca:
  instruction_template: 'KoAlpaca'
.*openbuddy:
  instruction_template: 'OpenBuddy'
(?!.*chat).*vigogne:
  instruction_template: 'Vigogne-Instruct'
.*vigogne.*chat:
  instruction_template: 'Vigogne-Chat'
.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct):
  instruction_template: 'Alpaca'
.*bactrian:
  instruction_template: 'Bactrian'
.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-):
  instruction_template: 'INCITE-Chat'
.*h2ogpt-gm-:
  instruction_template: 'H2O-prompt_answer'
.*manticore:
  instruction_template: 'Manticore Chat'
.*bluemoonrp-(30|13)b:
  instruction_template: 'Bluemoon'
.*Nous-Hermes-13b:
  instruction_template: 'Alpaca'
.*airoboros:
  instruction_template: 'Vicuna-v1.1'
.*airoboros.*1.2:
  instruction_template: 'Airoboros-v1.2'
.*alpa(cino|sta):
  instruction_template: 'Alpaca'
.*hippogriff:
  instruction_template: 'Hippogriff'
.*lazarus:
  instruction_template: 'Alpaca'
.*guanaco-.*(7|13|33|65)b:
  instruction_template: 'Vicuna-v0'
.*hypermantis:
  instruction_template: 'Alpaca'
.*open-llama-.*-open-instruct:
  instruction_template: 'Alpaca'
.*starcoder-gpteacher-code-instruct:
  instruction_template: 'Alpaca'
.*tulu:
  instruction_template: 'Tulu'
.*chronos:
  instruction_template: 'Alpaca'
.*samantha:
  instruction_template: 'Samantha'
.*wizardcoder:
  instruction_template: 'Alpaca'
.*minotaur:
  instruction_template: 'Manticore Chat'
.*orca_mini:
  instruction_template: 'Orca Mini'
.*(platypus|gplatty|superplatty):
  instruction_template: 'Alpaca'
.*(openorca-platypus2):
  instruction_template: 'OpenOrca-Platypus2'
.*longchat:
  instruction_template: 'Vicuna-v1.1'
.*vicuna-33b:
  instruction_template: 'Vicuna-v1.1'
.*redmond-hermes-coder:
  instruction_template: 'Alpaca'
.*wizardcoder-15b:
  instruction_template: 'Alpaca'
.*wizardlm:
  instruction_template: 'Vicuna-v1.1'
.*godzilla:
  instruction_template: 'Alpaca'
.*llama(-?)(2|v2).*chat:
  instruction_template: 'Llama-v2'
.*newhope:
  instruction_template: 'NewHope'
.*stablebeluga2:
  instruction_template: 'StableBeluga2'
.*openchat:
  instruction_template: 'OpenChat'
.*codellama.*instruct:
  instruction_template: 'Llama-v2'
.*(mistral|mixtral).*instruct:
  instruction_template: 'Mistral'
.*mistral.*openorca:
  instruction_template: 'ChatML'
.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1):
  instruction_template: 'Alpaca'
.*orca-2-(13|7)b:
  instruction_template: 'ChatML'
.*openhermes.*mistral:
  instruction_template: 'ChatML'
.*Yi-34B-Chat:
  instruction_template: 'ChatML'
(dolphin).*:
  instruction_template: 'ChatML'
.*synthia:
  instruction_template: 'Synthia'
.*(hercules|hyperion):
  instruction_template: 'ChatML'
.*command-r:
  instruction_template: 'Command-R'
.*xwin-lm-70b-v0.1:
  instruction_template: 'Vicuna-v1.1'
.*platypus-yi-34b:
  instruction_template: 'Vicuna-v1.1'
.*CausalLM-RP-34B:
  instruction_template: 'ChatML'
# Exact-name entry (no leading .* — presumably still treated as a regex
# by the consumer; verify anchoring behavior there).
34b-beta:
  instruction_template: 'ChatML'
.*airoboros-3_1-yi-34b-200k:
  instruction_template: 'Llama-v2'
.*chatqa:
  instruction_template: 'NVIDIA-ChatQA'